# Parsing scraped data with XPath (lxml)
from lxml import etree
# Sample XML document queried by the XPath examples in this script.
# The literal is assembled from adjacent string pieces, one per XML line; the
# Python-level indentation only illustrates nesting and is NOT part of the
# string (the XML itself has no indentation, and starts/ends with a newline).
xml_str = (
    '\n'
    '<supermarket>\n'
        '<name>永辉超市</name>\n'
        '<address>肖家河大厦</address>\n'
        '<goodsList>\n'
            '<goods name="泡面" price="3.5" count="20"></goods>\n'
            '<goods name="矿泉水" price="2" count="50"></goods>\n'
            '<goods name="面包" price="5" count="15"></goods>\n'
        '</goodsList>\n'
        '<worker_list>\n'
            '<cashier name="张三" pay="4000"></cashier>\n'
            '<shoppingGuide name="李四" pay="3500"></shoppingGuide>\n'
        '</worker_list>\n'
        '<goods price="50" count="15">\n'
            '<name>烟</name>\n'
        '</goods>\n'
    '</supermarket>\n'
)
# Convert the XML text into a tree structure.
# etree.XML() parses the string and returns the root element of the tree
# (here the <supermarket> element), which is the starting node for the
# xpath() queries that follow.
supermarket = etree.XML(xml_str)
print(supermarket)
# Selecting nodes
# node.xpath(path) finds the nodes matching `path` and returns them as a list
# of node objects.

# a. Absolute paths
# An absolute path is always written starting from the root node, no matter
# which node xpath() is called on.
# Syntax: /absolute/path
cashier = supermarket.xpath('/supermarket/worker_list/cashier')
print(cashier)

# xpath() returns a list, so take element 0 to get the <worker_list> node.
worker_list = supermarket.xpath('/supermarket/worker_list')[0]
print(worker_list)

# Called on worker_list this time, but the absolute path is still written
# from the root, so the same path as above is used.
cashier = worker_list.xpath('/supermarket/worker_list/cashier')
print(cashier)
# b. Relative paths
# `.` stands for the current node: whichever node xpath() is called on.
# `..` stands for the parent of the current node.
# The leading `./` of a relative path may be omitted.
cashier = supermarket.xpath('./worker_list/cashier')
print(cashier)

cashier = worker_list.xpath('./cashier')
print(cashier)
print('-----------')

# Same query as the previous one, with the optional `./` left out.
cashier = worker_list.xpath('cashier')
print(cashier)
# c. `//` paths
# A path starting with `//` searches from anywhere in the whole document.
# The result does not depend on which node xpath() is called on.
cashier = supermarket.xpath('//cashier')
print(cashier)

# Every <goods> element in the document, at any depth.
goods = supermarket.xpath('//goods')
print(goods)

# Only the <goods> elements that are direct children of a <goodsList>.
goods = supermarket.xpath('//goodsList/goods')
print(goods)
# Getting node text
# Syntax: <path to the node>/text()
# The result is a list of strings, one per matching text node.
name = supermarket.xpath('./name/text()')
print(name)

address = supermarket.xpath('./address/text()')
print(address)

# <name> nested inside the <goods> that is a direct child of the root.
name = supermarket.xpath('./goods/name/text()')
print(name)
# Getting attribute values
# Syntax: <path to the node>/@attribute_name
# The result is a list of attribute values as strings.

# `./goods` only matches <goods> elements that are direct children of the root.
price = supermarket.xpath('./goods/@price')
print(price)

count = supermarket.xpath('./goods/@count')
print(count)

# `//goods` matches every <goods> element in the document.
price = supermarket.xpath('//goods/@price')
print(price)