XPath(XML路径语言)
是一门在XML文档中查找信息的语言,可用来在XML文档中对元素和属性进行遍历。
W3School官方文档:http://www.w3school.com.cn/xpath/index.asp
XPath开发工具
开源的XPath表达式编辑工具:XMLQuire(XML格式文件可用)
Chrome插件XPath Helper
Firefox插件XPath Checker
XPath 语法
表达式 描述 用法说明
nodename 选取当前节点下所有名为nodename的子节点。xpath('span')选取当前节点下所有的span子元素
/ 从根节点选取。xpath('/div')从根节点上选取div节点
// 选取文档中所有匹配的节点,而不考虑它们的位置。xpath('//div')选取文档中所有的div节点,不论其位于何处
. 选取当前节点。xpath('./div')选取当前节点下的div标签
.. 选取当前节点的父节点。xpath('..')回到上一级节点
@ 选取属性。xpath("//div[@id='1001']")获取含有id属性且值为1001的div标签
XPath的常见用法大全
from lxml import etree
# Sample bookstore document that every XPath example below runs against.
html = '''
<bookstore>
<book price="100" category="cooking">
<title lang="en">Everyday Italian</title>
<author>Giada De Laurentiis</author>
<year>2005</year>
<price>30.00</price>
</book>
<book category="children">
<title lang="en">Harry Potter</title>
<author>J K. Rowling</author>
<year>2005</year>
<price>29.99</price>
</book>
<book category="web">
<title category="web">XQuery Kick Start</title>
<author>James McGovern</author>
<author>Per Bothner</author>
<author>Kurt Cagle</author>
<author>James Linn</author>
<author>Vaidyanathan Nagarajan</author>
<year>2003</year>
<price>49.99</price>
</book>
<book category="web" cover="paperback">
<title>Learning XML</title>
<author>Erik T. Ray</author>
<year>2003</year>
<price>39.95</price>
</book>
</bookstore>
'''
# etree.HTML() builds an XPath-capable tree from a string and auto-corrects
# malformed HTML along the way.
html = etree.HTML(html)  # load from a string
# html = etree.parse('temp.html')  # alternative: load from a file
# etree.tostring() serialises the corrected tree; the result is bytes.

# XPath examples. Exactly one assignment to `res` must stay active, otherwise
# the print(res) below fails with NameError. Swap the active line to try another.
# res = html.xpath('//bookstore/book/title/text()')  # text of every book title
# res = html.xpath('//book/@cover | //book/@category')  # union of two attribute selections
# res = html.xpath('//bookstore/book[1]/price/text()')  # price text of the first book
# res = html.xpath('//bookstore/book[position()<2]')  # first book; position() is 1-based
# res = html.xpath('//title[@lang]')  # titles that carry a lang attribute
# res = html.xpath('//title[@lang="en"]/text()')  # text of titles with lang="en"
# res = html.xpath('//bookstore/book[price>35.00]/title/text()')  # titles of books priced above 35.00
# res = html.xpath('//bookstore/*')  # direct child elements of bookstore
# res = html.xpath('//bookstore//*')  # every descendant element of bookstore
# res = html.xpath('//title[@*]')  # titles that have at least one attribute
# res = html.xpath('//book/title | //book/price')  # union of title and price elements
res = html.xpath('//*[@category="web"]')  # any element whose category is "web"
print(res)
58房源案例1
from lxml import etree
import requests
# Fetch the first page of 58.com Beijing rental listings and print, for each
# listing, its title and the text of its first <p> element.
base_url = 'http://bj.58.com/chuzu/?utm_source=market&spm=b-31580022738699-me-f-862.mingzhan&PGTID=0d100000-0000-17cd-3f99-94d590fc655b&ClickID=1'
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(base_url, timeout=10)
html = etree.HTML(response.text)
# Every listing is an <li> under the "listUl" list.
li_list = html.xpath('//ul[@class="listUl"]/li')
for li in li_list:
    # Extract the details of a single listing.
    title = li.xpath('.//h2/a/text()')
    if not title:
        # <li> entries without a title link are skipped.
        continue
    title = title[0].strip()
    # Guard the lookup the same way as the title: an empty result would
    # otherwise raise IndexError and abort the whole loop.
    square = li.xpath('.//p[1]/text()')
    square = square[0].replace(' ', '').replace('\xa0', '') if square else ''
    print(title, square)
58房源案例2
from lxml import etree
import requests
import json
# 详情页请求
def get_detail(url, f):
    """Scrape one rental detail page and append it to *f* as a JSON line.

    Args:
        url: URL of the detail page to request.
        f: Writable text file object; receives one JSON object followed by
            a newline.

    Returns:
        None. Nothing is written when the response status is not 2xx.

    Raises:
        SystemExit: If a required field cannot be extracted. The error and
            the offending URL are printed first, and the process exits with
            a non-zero status.
    """
    # A timeout keeps the crawl from hanging forever on one dead page.
    response = requests.get(url, timeout=10)
    # Only 2xx counts as success; 300 is already a redirection status, so
    # the upper bound is exclusive.
    if not 200 <= response.status_code < 300:
        return
    # Parse only after the status check so error pages are never processed.
    html = etree.HTML(response.text)
    try:
        title = html.xpath('//h1/text()')[0]
        price = html.xpath('//span[@class="c_ff552e"]/b/text()')[0]
        # The margin element is optional; fall back to the placeholder.
        margin = html.xpath('//span[@class="c_333"]/text()')
        margin = margin[0] if margin else '无'
        rent_type = html.xpath('//ul[@class="f14"]/li[1]/span[2]/text()')[0]
        house_type = html.xpath('//ul[@class="f14"]/li[2]/span[2]/text()')[0]
        direction = html.xpath('//ul[@class="f14"]/li[3]/span[2]/text()')[0]
        host = html.xpath('//ul[@class="f14"]/li[4]/span[2]/a/text()')[0]
        # The area is spread over several links, e.g. ['昌平', '立水桥'];
        # join them into a single string.
        area = ''.join(html.xpath('//ul[@class="f14"]/li[5]/span[2]/a/text()'))
        addr = html.xpath('//ul[@class="f14"]/li[6]/span[2]/text()')[0]
    except Exception as e:
        # Report what failed and where, then abort the run. SystemExit is
        # raised directly because the exit() helper comes from the `site`
        # module and is meant for interactive use; the non-zero status marks
        # the run as failed.
        print(e)
        print(url)
        raise SystemExit(1)
    data = {
        'title': title,
        'price': price,
        'margin': margin,
        'rent_type': rent_type,
        'house_type': house_type,
        'direction': direction,
        'host': host,
        'area': area,
        'addr': addr,
    }
    # Drop plain and non-breaking spaces, then trim leftover edge whitespace.
    data = {
        key: value.replace(' ', '').replace('\xa0', '').strip()
        for key, value in data.items()
    }
    # Show progress, then save the record as one JSON line.
    print(data['title'])
    f.write(json.dumps(data, ensure_ascii=False) + '\n')
def getPage():
    """Crawl the first rental listing page and scrape every linked detail page.

    Takes no arguments and returns None. The output file is not a parameter:
    the module-level name ``f`` is passed on to ``get_detail`` and must
    already be bound when this function is called.
    """
    listing_page = requests.get('http://bj.58.com/chuzu/pn1/')
    tree = etree.HTML(listing_page.text)
    # Every listing is an <li> under the "listUl" list.
    for item in tree.xpath('//ul[@class="listUl"]/li'):
        # Take the detail link from the listing; entries without one are skipped.
        hrefs = item.xpath('.//h2/a/@href')
        if not hrefs:
            continue
        # Request and store the detail page.
        get_detail(hrefs[0], f)
if __name__ == '__main__':
    # `f` must remain a module-level name because getPage() reads it as a
    # global. The with-block closes the file even when the crawl raises or
    # exits early, so written records are always flushed to disk.
    with open('house.json', 'w', encoding='utf-8') as f:
        getPage()
补充
from lxml import etree
# Fragment with deliberately malformed <a> tags (note the stray "/>" before
# the link text) to show how the parser copes with sloppy markup.
html = '''
<div class="php_zuopin_fenlei" style="height:70px;">
<span style="line-height:60px;">> 按学科:</span>
<div style="margin-left:65px;color:gray;font-size:12px;">
<a href="http://www.itxdl.cn/html/php/phparticles/" title="网络培训">PHP</a>
<a href="http://www.itxdl.cn/html/java/javaarticles/" title="特服培训"/>Java</a>
<a href="http://www.itxdl.cn/html/ui/uiuearticles/" title="散打培训"/>UI</a>
<a href="http://www.itxdl.cn/html/h5/HTML5articles/" title="赛车培训"/>Html5</a>
<a href="http://www.itxdl.cn/html/linux/linuxartices/" title="Linux培训"/>Linux</a>
</div>
</div>
'''
# Parse with an explicitly supplied (custom) parser.
custom_parser = etree.HTMLParser()
html = etree.HTML(html, parser=custom_parser)
# To inspect the markup after parsing (watch the irregular <a> tags):
# result = etree.tostring(html)
# print(result.decode('utf-8'))
link_texts = html.xpath('//div[@class="php_zuopin_fenlei"]//a/text()')
print(link_texts)