# xpath parsing (xpath解析)
'''
最常用、最便捷、最高效的一种解析方式,通用性强。
-xpath解析原理
-实例化etree对象,并且需要将被解析的页面源码 数据加载到对象中
-调用etree对象中的xpath结合着xpath表达式实现标签的定位和内容的捕获
'''
# 如何实例化etree对象
# -将本地的html文档中源码数据加载到etree中:
# etree.parase(filepath)
# -可以将互联网上的源码数据加载到该对象中
# etree.HTML('page_text')
# -xpath('xpath表达式')
# Demo: instantiate an etree object from a local HTML file and run xpath queries.
from lxml import etree

# The HTML here is not strictly well-formed XML, so use a lenient HTML parser.
html_parser = etree.HTMLParser(encoding='utf-8')
tree = etree.parse('1.html', parser=html_parser)

# A leading '/' anchors the path at the document root; each further '/' is one level.
result = tree.xpath('/html/body/div')
print(result)  # xpath() returns a list of Element objects, e.g. [<Element div at 0x...>]

# '//' skips any number of levels, so this matches more nodes than the query above.
result = tree.xpath('/html//div')
print(result)

# Attribute predicate: locate <div> elements with a specific class value.
result = tree.xpath('//div[@class="dzpzmain"]')
print(result)

result = tree.xpath('//div[@class="dzpzmain"]/div/ul/li[@class="columm"]')
print(result)

# Text extraction:
#   /text()  -> direct (immediate-child) text of the tag
#   //text() -> all text under the tag, including nested descendants
# Attribute extraction: /@attrName, e.g. img/@src
result = tree.xpath('//div[@class="dzpzmain"]//li/i/text()')
print(result)

result = tree.xpath('//div/a/@href')[0]
print(result)
'''
xpath案例讲解
爬取58二手房中的相关信息
'''
# Case study: scrape listing titles and prices from 58.com second-hand houses,
# then print the average price.
import requests
from lxml import etree
import matplotlib.pyplot as plt
import numpy as np

headers_firefox={'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
headers_google={
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}

url = 'https://gz.58.com/ershoufang/'
page_text = requests.get(url=url, headers=headers_google).text

# Data parsing: build the tree straight from the downloaded source string.
tree = etree.HTML(page_text)

# One query suffices (the original ran the identical xpath twice):
# each <li> carries both the listing title and its price.
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

name_list = []
price_list = []
for li in li_list:
    # The leading '.' makes the xpath relative to the current <li>.
    name = li.xpath('./div[@class="list-info"]/h2/a/text()')[0]
    print(name)
    name_list.append(name)

    price_parts = li.xpath('./div[@class="price"]//b//text()')
    if not price_parts:
        continue  # listing without a visible price
    print(price_parts[0], '万元')
    # BUG FIX: the original appended to price_li_list -- the very list being
    # iterated -- producing an endless loop, and then fed raw strings to np.mean.
    try:
        price_list.append(float(price_parts[0]))
    except ValueError:
        pass  # non-numeric price text; skip it

# Mean listing price in 万元 (10,000 CNY).
print(np.mean(price_list))
# 实战(重点): scrape 4k images with xpath
# Case study: download wallpaper thumbnails from pic.netbian.com page by page.
import requests
from lxml import etree
import os

headers_firefox={'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
headers_google={
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}

# Create the destination folder once, up front.
if not os.path.exists('./picLibs'):
    os.mkdir('./picLibs')

for page in range(1, 101):
    # BUG FIX: the site serves page 1 as index.html; the original started at
    # index_0.html (then index_1.html), neither of which exists on the site.
    if page == 1:
        url = 'http://pic.netbian.com/4kmeinv/index.html'
    else:
        url = 'http://pic.netbian.com/4kmeinv/index_' + str(page) + '.html'
    response = requests.get(url=url, headers=headers_google)
    # response.encoding='utf-8'  # optional: force the response encoding instead
    page_text = response.text

    # Parse out the src and alt attributes of every thumbnail.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
        # The page is GBK-encoded; re-decode to fix mojibake in Chinese names.
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        # Fetch the image bytes and persist them to disk.
        img_data = requests.get(url=img_src, headers=headers_google).content
        img_path = 'picLibs/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
            print(img_name, '下载成功')
# 实战(重点): parse out all city names with xpath
# Case study: scrape hot-city and all-city names from aqistudy.cn.
import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers_firefox={'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
headers_google={
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
response = requests.get(url=url, headers=headers_google)
response.encoding = 'utf-8'
page_text = response.text
tree = etree.HTML(page_text)

# Hot cities live under //div[@class="bottom"]/ul/li/a.
all_hot_city_name = []
for li in tree.xpath('//div[@class="bottom"]/ul/li'):
    # BUG FIX: take the first text node ([0]) so this list holds strings,
    # matching the all-cities loop below (the original appended whole lists).
    all_hot_city_name.append(li.xpath('./a/text()')[0])

# Every other city lives under //div[@class="bottom"]/ul/div[2]/li/a.
all_city_name = []
for li in tree.xpath('//div[@class="bottom"]/ul/div[2]/li'):
    all_city_name.append(li.xpath('./a/text()')[0])
# Parse hot cities and all other cities with a single xpath query.
url = 'https://www.aqistudy.cn/historydata/'
headers_firefox={'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
headers_google={
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
response = requests.get(url=url, headers=headers_google)
response.encoding = 'utf-8'
page_text = response.text
tree = etree.HTML(page_text)

# The '|' operator unions two xpath expressions into one result set.
# Hot cities: div/ul/li/a    All other cities: div/ul/div[2]/li/a
# BUG FIX: the second branch was missing the trailing /a, so the loop below
# read the <li>'s own (whitespace) text instead of the link text.
a_list = tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a')
china_city_names = []
for a in a_list:
    china_city_names.append(a.xpath('./text()')[0])
print(len(china_city_names))  # expect 394 names in total