# 1. xpath和xml数据格式
from lxml import etree
"""
树:整个html或xml结构
节点:html中的每个标签,xml中标签就是节点
根节点:树的第一个节点,html的根节点就是html标签
属性:节点属性(html中就是标签属性)
"""
"""
将一个超市的商品数据进行传输:
json:
{
"name": "永辉超市",
"address": "肖家河大厦",
"goods": [
{"name": "泡面", "price": 3.5, "count": 20},
{"name": "矿泉水", "price": 2, "count": 50},
{"name": "面包", "price": 5, "count": 15}
]
}
xml:
<supermarket>
<name>永辉超市</name>
<address>肖家河大厦</address>
<goodsList>
<goods name="泡面" price="3.5" count="20"></goods>
<goods name="矿泉水" price="2" count="50"></goods>
<goods name="面包" price="5" count="15"></goods>
</goodsList>
</supermarket>
"""
# Sample XML document used by the XPath queries that follow.
# Layout of the tree: the root <supermarket> has a <name>, an <address>,
# a <goodsList> holding three attribute-only <goods> elements, a
# <worker_list> holding one <cashier> and one <shoppingGuide>, and one more
# <goods> element placed directly under the root that carries a nested <name>.
xml_str = """
<supermarket>
<name>永辉超市</name>
<address>肖家河大厦</address>
<goodsList>
<goods name="泡面" price="3.5" count="20"></goods>
<goods name="矿泉水" price="2" count="50"></goods>
<goods name="面包" price="5" count="15"></goods>
</goodsList>
<worker_list>
<cashier name="张三" pay="4000"></cashier>
<shoppingGuide name="李四" pay="3500"></shoppingGuide>
</worker_list>
<goods price="50" count="15">
<name>烟</name>
</goods>
</supermarket>
"""
# Parse the XML text; the value bound here is the root <supermarket> element.
supermarket = etree.XML(xml_str)
print(supermarket)

# Absolute path: starts with "/" and is walked from the root of the tree.
cashier = supermarket.xpath('/supermarket/worker_list/cashier')
print(cashier)

# xpath() returns a list, so [0] picks out the single <worker_list> element.
worker_list = supermarket.xpath('/supermarket/worker_list')[0]
print(worker_list)

# An absolute path does not depend on the node it is called on: it is still
# resolved from the tree root (<supermarket>), so "/worker_list/..." matches
# nothing and an empty list is printed.
result = worker_list.xpath('/worker_list/cashier')
print(result)

# Relative paths are resolved from the node xpath() is called on.
# A leading "./" is optional, so each pair below finds the same <cashier>.
for context, path in (
    (supermarket, './worker_list/cashier'),  # explicit "./", from the root
    (worker_list, './cashier'),              # explicit "./", from <worker_list>
    (supermarket, 'worker_list/cashier'),    # same as the first, "./" omitted
    (worker_list, 'cashier'),                # same as the second, "./" omitted
):
    cashier = context.xpath(path)
    print(cashier)

# "//" searches the whole document, whatever the depth of the match.
for path in (
    '//cashier',          # every <cashier> element
    '//goods',            # every <goods>: three in <goodsList> plus the top-level one
    '//goodsList/goods',  # only <goods> that are direct children of a <goodsList>
):
    result = supermarket.xpath(path)
    print(result)

# text() yields the text content of the selected elements.
# Only the <name> directly under the root.
name = supermarket.xpath('./name/text()')
print(name)
# Every <name> in the document, including the one nested in the last <goods>.
names = supermarket.xpath('//name/text()')
print(names)

# "@attr" yields attribute values (as strings).
for path in (
    './goods/@price',  # price of the <goods> that sits directly under the root
    '//goods/@price',  # price of every <goods> in the document
):
    result = supermarket.xpath(path)
    print(result)
# 2. 解析html
from lxml import etree
# Load test.html and parse it into an element tree; `html` is the root element
# returned by etree.HTML. The file is read inside a context manager so the
# handle is closed as soon as its text has been read, instead of being left
# open until garbage collection.
with open('test.html', encoding='utf-8') as page:
    html = etree.HTML(page.read())
# Three ways of addressing <h1>: absolute path, path relative to the node
# xpath() is called on (`html`), and a search of the whole document.
for query in ('/html/body/h1', './body/h1', '//h1'):
    h1 = html.xpath(query)
    print(h1)

# [N] is a position predicate (1-based): text of the first <p> child of <body>.
p = html.xpath('./body/p[1]/text()')
print(p)

# Each query below is evaluated against `html` and its result list is printed.
for query in (
    # --- position predicates ---
    # <p> texts inside the 2nd <li>
    './body/ul/li[2]/p/text()',
    # second-to-last <p> of every <li>
    './body/ul/li/p[last()-1]/text()',
    # last <p> of the second-to-last <li>
    './body/ul/li[last()-1]/p[last()]/text()',
    # <p> texts of the first two <li>
    './body/ul/li[position()<=2]/p/text()',
    # <p> texts of the <li> from the third one onwards
    './body/ul/li[position()>2]/p/text()',

    # --- attribute predicates ---
    # <p> that has a class attribute, whatever its value
    './body/div/p[@class]/text()',
    # <p> whose class is exactly "c1"
    './body/div/p[@class="c1"]/text()',
    # same class test, but searching the whole document
    '//p[@class="c1"]/text()',
    # <p> whose id is "p1"
    './body/div/p[@id="p1"]/text()',

    # --- predicates on child content ---
    # all <p> texts of the <li> whose 2nd <p> holds a number greater than 4
    './body/ul/li[p[2]>4]/p/text()',
    # first <p> text of the <li> whose 3rd <p> holds a number greater than 30
    './body/ul/li[p[3]>30]/p[1]/text()',
    # all <p> texts of the <li> whose 1st <p> content equals the given string
    './body/ul/li[p[1]="面包"]/p/text()',

    # --- wildcards ---
    # "*" matches any element: every child of the <div> with id "div1"
    './body/div[@id="div1"]/*',
    # children of that <div> that carry a class attribute
    './body/div[@id="div1"]/*[@class]',
    # any element in the document whose class is "c1"
    '//*[@class="c1"]',
    # "@*" matches any attribute: <p> of the last <div> with at least one attribute
    './body/div[last()]/p[@*]/text()',
    # <p> of the last <div> having some attribute whose value is "p"
    './body/div[last()]/p[@*="p"]/text()',
    # every attribute value of every <img>
    '//img/@*',

    # --- union ---
    # "|" merges two node sets: 1st and 3rd <p> texts of every <li>
    './body/ul/li/p[1]/text()|./body/ul/li/p[3]/text()',
):
    result = html.xpath(query)
    print(result)