xml文件:
# xml文件内容如下:
"""
<annotation>
<folder>VOC2007</folder>
<filename>000005.jpg</filename>
<source>
<database>The VOC2007 Database</database>
<annotation>PASCAL VOC2007</annotation>
<image>flickr</image>
<flickrid>325991873</flickrid>
</source>
<owner>
<flickrid>archintent louisville</flickrid>
<name>?</name>
</owner>
<size>
<width>500</width>
<height>375</height>
<depth>3</depth>
</size>
<segmented>0</segmented>
<object>
<name>chair</name>
<pose>Rear</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>263</xmin>
<ymin>211</ymin>
<xmax>324</xmax>
<ymax>339</ymax>
</bndbox>
</object>
<object>
<name>chair</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>165</xmin>
<ymin>264</ymin>
<xmax>253</xmax>
<ymax>372</ymax>
</bndbox>
</object>
<object>
<name>chair</name>
<pose>Unspecified</pose>
<truncated>1</truncated>
<difficult>1</difficult>
<bndbox>
<xmin>5</xmin>
<ymin>244</ymin>
<xmax>67</xmax>
<ymax>374</ymax>
</bndbox>
</object>
<object>
<name>chair</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>241</xmin>
<ymin>194</ymin>
<xmax>295</xmax>
<ymax>299</ymax>
</bndbox>
</object>
<object>
<name>chair</name>
<pose>Unspecified</pose>
<truncated>1</truncated>
<difficult>1</difficult>
<bndbox>
<xmin>277</xmin>
<ymin>186</ymin>
<xmax>312</xmax>
<ymax>220</ymax>
</bndbox>
</object>
</annotation>
"""
lxml处理xml文件代码:
from lxml import etree
# Demo script: read a PASCAL VOC annotation file with lxml and print, for
# every <object>, its "difficult" flag, class name and bounding box, then
# show how relative and absolute XPath expressions differ on an element.

# Alternative sample with fewer objects: r'000020.xml'
xmlPath = r'000005.xml'  # sample containing several <object> entries

# The annotation is XML, so use lxml's default XML parser. The HTML parser
# wraps the document in synthetic <html><body> elements, which is why the
# earlier version had to spell the path as '/html/body/annotation/...'.
tree = etree.parse(xmlPath)

# Read every field relative to its own <object> node. Collecting six
# independent '//...text()' lists and pairing them by index mispairs values
# (or raises IndexError) as soon as one object lacks one of the children.
for obj in tree.xpath('/annotation/object'):
    difficult = obj.findtext('difficult', default='')
    classType = obj.findtext('name', default='')
    # int() raises if a coordinate is absent or not numeric; a box without
    # all four corners is unusable, so failing loudly is intended here.
    xmin = int(obj.findtext('bndbox/xmin'))
    ymin = int(obj.findtext('bndbox/ymin'))
    xmax = int(obj.findtext('bndbox/xmax'))
    ymax = int(obj.findtext('bndbox/ymax'))
    info = \
        "difficult:{0:^4}classType:{1:^10}xmin:{2:^6}ymin:{3:^6}xmax:{4:^6}ymax:{5:^6}".format(
            difficult, classType, xmin, ymin, xmax, ymax)
    print(info)

print("--"*10)
objectsList = tree.xpath('//object')
if objectsList:  # nothing to demonstrate when the file has no <object>
    obj = objectsList[0]
    # A path without a leading slash is evaluated relative to `obj`, so this
    # yields only the first object's xmin text.
    xmin = obj.xpath('bndbox/xmin/text()')
    print(xmin)
    print("*"*20)
    # NOTE: obj.xpath('//bndbox') and obj.xpath('.//bndbox') are different.
    # A leading '//' starts from the document root, so the first form is
    # equivalent to tree.xpath('//bndbox') and returns every <bndbox> in the
    # file. The second form searches only the descendants of the current
    # `obj` node.
    data = obj.xpath('//bndbox')
    print(data)
控制台输出结果:
Windows PowerShell
版权所有 (C) Microsoft Corporation。保留所有权利。
尝试新的跨平台 PowerShell https://aka.ms/pscore6
PS C:\Users\chenxuqi\Desktop\新建文件夹\test> & 'D:\Python\Python37\python.exe' 'c:\Users\chenxuqi\.vscode\extensions\ms-python.python-2020.11.358366026\pythonFiles\lib\python\debugpy\launcher' '55379' '--' 'c:\Users\chenxuqi\Desktop\新建文件夹\test\lxml库
处理xml文件 .py'
difficult: 0 classType: chair xmin: 263 ymin: 211 xmax: 324 ymax: 339
difficult: 0 classType: chair xmin: 165 ymin: 264 xmax: 253 ymax: 372
difficult: 1 classType: chair xmin: 5 ymin: 244 xmax: 67 ymax: 374
difficult: 0 classType: chair xmin: 241 ymin: 194 xmax: 295 ymax: 299
difficult: 1 classType: chair xmin: 277 ymin: 186 xmax: 312 ymax: 220
--------------------
['263']
********************
[<Element bndbox at 0x1cb4992da08>, <Element bndbox at 0x1cb4992da48>, <Element bndbox at 0x1cb4992da88>, <Element bndbox at 0x1cb4992dac8>, <Element bndbox at 0x1cb4992db08>]
PS C:\Users\chenxuqi\Desktop\新建文件夹\test>
参考链接: 崔庆才-python3网络爬虫开发实战