xpath
xpath是一个解析网页的工具,解析对象是html或xml文档。
专业术语
树:整个html或xml结构
节点:html中的每个标签,xml中标签就是节点
根节点:树的第一个节点,html的根节点就是html标签
属性:节点属性(html中就是标签属性)
from lxml import etree
# xml数据结构
# json数据和xml数据是两种通用的数据模式,用于不同语言之间进行数据交流
"""
将一个超市的商品数据进行传输:
json:
{
"name": "永辉超市",
"address": "撒厚厦大厦",
"goods": [
{"name": "泡面", "price": 3.5 , "count": 20},
{"name": "矿泉水", "price": 2 , "count": 50},
{"name": "面包", "price": 5 , "count": 15}
]
}
xml:
<supermarket>
<name>永辉超市</name>
<address>撒厚厦大厦</address>
<goodslist>
<goods name="泡面" price= "3.5" count="20"></goods>
<goods name="矿泉水" price= "2" count="50"></goods>
<goods name="面包" price= "5" count="15"></goods>
</goodslist>
</supermarket>
"""
# 1) Sample XML document used by the XPath examples that follow.
# Besides the three <goods> elements inside <goodsList>, there is a fourth
# <goods> placed directly under <supermarket>; it carries its name in a child
# <name> element instead of a "name" attribute.
xml_str ="""
<supermarket>
<name>永辉超市</name>
<address>撒厚厦大厦</address>
<goodsList>
<goods name="泡面" price="3.5" count="20"></goods>
<goods name="矿泉水" price="2" count="50"></goods>
<goods name="面包" price="5" count="15"></goods>
</goodsList>
<worker_list>
<cashier name="张三" pay="4000"></cashier>
<shoppingGuide name="李四" pay="3500"></shoppingGuide>
</worker_list>
<goods price="50" count="15">
<name>烟</name>
</goods>
</supermarket>
"""
# 2) Create the tree object and get the root node of the data
supermarket = etree.XML(xml_str)
# 3) Selecting tags (nodes)
# node.xpath(path) - find the nodes matching the path; the matches come back
# as a list of node objects.
# a. Absolute path: no matter which node object .xpath() is called on, the
#    path is written starting from the root node.
#    Syntax: /absolute/path
cashier = supermarket.xpath('/supermarket/worker_list/cashier')
print(cashier)
worker_list = supermarket.xpath('/supermarket/worker_list')[0]
print(worker_list)
# The same absolute path evaluated from a different node (worker_list) to show
# that the starting node does not matter. Without the leading "/" the path
# would be relative to worker_list, and 'worker_list/cashier' would match
# nothing because worker_list has no <worker_list> child.
result = worker_list.xpath('/supermarket/worker_list/cashier')
print(result)
# b. Relative path: "." stands for the current node, and the current node is
#    whichever node .xpath() is called on. The last two pairs drop the
#    leading "./" and are written relative to the calling node as well.
relative_queries = (
    (supermarket, './worker_list/cashier'),
    (worker_list, './cashier'),
    (supermarket, 'worker_list/cashier'),
    (worker_list, 'cashier'),
)
for start_node, relative_path in relative_queries:
    cashier = start_node.xpath(relative_path)
    print(cashier)  # sample output: [<Element cashier at 0x233f91290c0>]

# c. "//" path - global search starting from any position in the document.
#    How and what it finds does not depend on the node .xpath() is called on.
#    Sample output recorded for the three queries below:
#      '//cashier'         -> one <cashier> element
#      '//goods'           -> four <goods> elements
#      '//goodsList/goods' -> three <goods> elements
for global_path in ('//cashier', '//goods', '//goodsList/goods'):
    result = supermarket.xpath(global_path)
    print(result)
# 4) Getting node content
# Syntax: path-to-the-node/text()
name = supermarket.xpath('./name/text()')     # <name> children of the root only
names = supermarket.xpath('//name/text()')    # every <name> in the document
print(name, names, sep='\n')

# 5) Getting attribute values
# Syntax: path-to-the-node/@attribute-name
for price_path in ('./goods/@price', '//goods/@price'):
    result = supermarket.xpath(price_path)
    print(result)
解析html文件
from lxml import etree
# Read the sample page and parse it into an element tree. The `with` block
# closes the file as soon as its text has been read, instead of leaving the
# handle open for the rest of the script.
with open('test.html', encoding='utf-8') as html_file:
    html = etree.HTML(html_file.read())

# Select the <h1> three ways: absolute path, path relative to the parsed
# root node, and global "//" search. The sample output recorded in these
# notes shows the same element for all three.
h1 = html.xpath('/html/body/h1')
print(h1)  # [<Element h1 at 0x1a031739140>]
h1 = html.xpath('./body/h1')
print(h1)  # [<Element h1 at 0x1a031739140>]
h1 = html.xpath('//h1')
print(h1)  # [<Element h1 at 0x1a031739140>]
# 1. Predicates (conditions)
# Syntax: path-to-the-selected-tag[predicate]

# 1) [N] - the N-th tag among the tags at the same level
for query in (
    '//p[1]/text()',
    './body/p[1]/text()',
    './body/ul/li[2]/p/text()',
):
    p = html.xpath(query)
    print(p)

# 2) [last()]   - the last tag at the same level
#    [last()-N] - the (N+1)-th tag counting from the end at the same level
for query in (
    './body/ul/li/p[last()-1]/text()',
    './body/ul/li[last()-1]/p[last()]/text()',
):
    result = html.xpath(query)
    print(result)

# 3) Position comparisons:
#    [position()>N], [position()<N], [position()>=N], [position()<=N]
for query in (
    './body/ul/li[position()<=2]/p/text()',
    './body/ul/li[position()>2]/p/text()',
):
    result = html.xpath(query)
    print(result)

# 4) [@attribute] - tags that have the given attribute
for query in (
    './body/div/p[@class]/text()',
    './body/div/p[@id]/text()',
):
    result = html.xpath(query)
    print(result)

#    [@attribute=value] - tags whose given attribute has the given value
for query in (
    './body/div/p[@class="c1"]/text()',
    '//p[@class="c1"]/text()',
    '//p[@id="p1"]/text()',
):
    result = html.xpath(query)
    print(result)

# 5) [child-tag >/</>=/<=/= data] - filter tags by the content of the
#    specified child tag
for query in (
    './body/ul/li[p[2]>4]/p/text()',
    './body/ul/li[p[3]>30]/p[1]/text()',
    './body/ul/li[p[1]="面包"]/p/text()',
):
    result = html.xpath(query)
    print(result)
# 2. Wildcard: *
# 1) In tag position it stands for any tag
for query in (
    './body/div[@id="div1"]/*',
    './body/div[@id="div1"]/*[@class]',
    '//*[@class="c1"]',
):
    result = html.xpath(query)
    print(result)

# 2) After "@" it stands for any attribute
for query in (
    './body/div[last()]/p[@*]/text()',
    './body/div[last()]/p[@*="p"]/text()',
    '//img/@*',
):
    result = html.xpath(query)
    print(result)

# 3. Branches (selecting several paths in one query)
# Note: what a single "|" separates must be two independent, complete paths.
union_query = './body/ul/li/p[1]/text()|./body/ul/li/p[3]/text()'
result = html.xpath(union_query)
print(result)
爬某瓣
from selenium.webdriver import Chrome
import csv
from lxml import etree
# Scrape the Douban movie "explore" page: collect the title, score and poster
# URL of every listed movie, then save the rows to a CSV file.
b = Chrome()
try:
    b.get('https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0')
    # page_source is a plain str, which has no .xpath() method; parse it into
    # an lxml element tree before querying it.
    html = etree.HTML(b.page_source)
finally:
    # Shut the browser down even when loading the page fails.
    b.quit()

# Each <a> inside <div class="list"> is expected to describe one movie.
# NOTE(review): the list may be filled in by JavaScript after the initial
# load; if all_a comes back empty, wait for the elements before reading
# page_source.
all_a = html.xpath('//div[@class="list"]/a')
all_data = []
for i in all_a:
    img = i.xpath('./div/img/@src')[0]
    name = i.xpath('./div/img/@alt')[0]
    score = i.xpath('./p/strong/text()')[0]
    all_data.append([name, score, img])

# The `with` block flushes and closes the CSV file once every row is written.
# newline='' is what the csv module documentation asks for on files handed to
# csv.writer. The 'files' directory must already exist.
with open('files/豆瓣电影数据.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['电影名称', '评分', '图片链接'])
    writer.writerows(all_data)