小何的爬虫笔记——xpath_xpath etree div-CSDN博客

本文链接：https://blog.csdn.net/HYS662020/article/details/109632308

xpath解析

'''
最常用、最便捷、高效的一种解析方式，通用性强
-xpath解析原理
    -实例化etree对象，并且需要将被解析的页面源码 数据加载到对象中
    -调用etree对象中的xpath结合着xpath表达式实现标签的定位和内容的捕获

'''

# 如何实例化etree对象
#     -将本地的html文档中源码数据加载到etree中：
#         etree.parse(filepath)
#     -可以将互联网上的源码数据加载到该对象中
#         etree.HTML('page_text')
#      -xpath('xpath表达式')
from lxml import etree
# Real-world HTML is rarely well-formed XML, so hand etree the lenient HTML
# parser instead of letting it fall back to the strict XML one.
parser = etree.HTMLParser(encoding='utf-8')

# Build the element tree from the local file using that parser.
tree = etree.parse('1.html', parser=parser)

# Each query below is run and its raw result list printed.
for expression in (
    # A leading "/" starts at the root node; every further "/" is one level.
    '/html/body/div',
    # "//" spans any number of levels, so this finds at least as many <div>s
    # as the query above.
    '/html//div',
    # [@attr="value"] narrows the match to elements with that attribute value.
    '//div[@class="dzpzmain"]',
    # Predicates can be chained along a longer path.
    '//div[@class="dzpzmain"]/div/ul/li[@class="columm"]',
    # Reading text:
    #   /text()   -> text that is a direct child of the element
    #   //text()  -> all text below the element, direct or nested
    '//div[@class="dzpzmain"]//li/i/text()',
):
    r = tree.xpath(expression)
    print(r)  # element queries give a list of Element objects, text() a list of strings

# Reading an attribute: append /@attrName to the path (e.g. img/@src).
# xpath() always returns a list, so take the first match.
r = tree.xpath('//div/a/@href')[0]
print(r)






'''
xpath案例讲解
爬取58二手房中的相关信息
'''
import requests
from lxml import etree
import matplotlib.pyplot as plt
import numpy as np
# Browser-like User-Agent headers; only headers_google is used for the request.
headers_firefox = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0',
}
headers_google = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
}
# Lenient HTML parser (not used below because etree.HTML is called directly).
parser = etree.HTMLParser(encoding='utf-8')
parame = {}

url = 'https://gz.58.com/ershoufang/'
page_text = requests.get(url=url, headers=headers_google).text

# Parse the listing page.
tree = etree.HTML(page_text)
name_li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

# Collect the title of every listing.
name_list = []
for li in name_li_list:
    # The leading "." makes the path relative to this <li>; without it the
    # query would start from the document root again.
    names = li.xpath('./div[@class="list-info"]/h2/a/text()')
    if not names:
        # Skip entries without a title instead of failing on [0].
        continue
    name = names[0]
    print(name)
    name_list.append(name)

price_li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
print(price_li_list)

# Collect the price of every listing as a number so it can be averaged.
price_list = []
unit = '万元'
for li in price_li_list:
    price = li.xpath('./div[@class="price"]//b//text()')
    if not price:
        continue
    print(price[0], unit)
    try:
        # Store into price_list, never into the list being iterated.
        price_list.append(float(price[0]))
    except ValueError:
        # Non-numeric price text cannot take part in the average.
        continue

# np.mean of an empty list only yields nan plus a RuntimeWarning.
if price_list:
    print(np.mean(price_list))
else:
    print('未解析到任何价格')

实战：重点_xpath爬取4k图片

import requests
from lxml import etree
import os
# Browser-like User-Agent headers; only headers_google is used for the requests.
headers_firefox = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0',
}
headers_google = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
}

# Create the output folder; exist_ok avoids the check-then-create race.
os.makedirs('./picLibs', exist_ok=True)

# Characters that are not allowed in file names on common platforms are
# replaced by "_" so an unusual alt text cannot break open().
_UNSAFE_NAME_CHARS = str.maketrans('\\/:*?"<>|', '_' * 9)

for i in range(0, 100):
    url = 'http://pic.netbian.com/4kmeinv/index_' + str(i) + '.html'
    response = requests.get(url=url, headers=headers_google)
    # Decode the page as GBK up front (assumes the site serves GBK — confirm
    # if names come out garbled). This replaces re-encoding each name through
    # ISO-8859-1, which fails whenever requests picked a different encoding.
    response.encoding = 'gbk'
    page_text = response.text

    # Each <li> holds one thumbnail; src is the image path, alt its title.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')

    for li in li_list:
        src = li.xpath('./a/img/@src')
        alt = li.xpath('./a/img/@alt')
        if not src or not alt:
            # Skip list items that are not image entries.
            continue
        img_src = 'http://pic.netbian.com' + src[0]
        img_name = alt[0].translate(_UNSAFE_NAME_CHARS) + '.jpg'

        # Fetch the image bytes and store them on disk.
        img_data = requests.get(url=img_src, headers=headers_google).content
        img_path = os.path.join('picLibs', img_name)
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name, '下载成功')

实战：重点_xpath解析出所有城市名称

import requests
from lxml import etree





url = 'https://www.aqistudy.cn/historydata/'
# Browser-like User-Agent headers; only headers_google is used for the request.
headers_firefox = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0',
}
headers_google = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
}
response = requests.get(url=url, headers=headers_google)
response.encoding = 'utf-8'
page_text = response.text

tree = etree.HTML(page_text)

# Hot cities: one <li> per city directly under the <ul>.
all_hot_city_name = []
hot_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
for li in hot_li_list:
    hot_city_name = li.xpath('./a/text()')
    if hot_city_name:
        # xpath() returns a list; keep the name itself so this list holds
        # plain strings, just like all_city_name below.
        all_hot_city_name.append(hot_city_name[0])

# All cities: the <li> items inside the second <div> of the same <ul>.
all_city_name_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
all_city_name = []
for li in all_city_name_list:
    city_name = li.xpath('./a/text()')
    if city_name:
        # Skip items without link text instead of failing on [0].
        all_city_name.append(city_name[0])






# Parse the hot cities and all other cities with a single XPath query.
url = 'https://www.aqistudy.cn/historydata/'
# Browser-like User-Agent headers; only headers_google is used for the request.
headers_firefox = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0',
}
headers_google = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
}
response = requests.get(url=url, headers=headers_google)
response.encoding = 'utf-8'
page_text = response.text
tree = etree.HTML(page_text)

# Select the <a> tags of both groups at once with the XPath union operator "|":
#   hot cities:        div/ul/li/a
#   all other cities:  div/ul/div[2]/li/a
# Both branches must end in /a so every selected node is a link whose own
# text is the city name.
a_list = tree.xpath(
    '//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a'
)
china_city_names = []
for a in a_list:
    texts = a.xpath('./text()')
    if texts:
        # Skip links without text instead of failing on [0].
        china_city_names.append(texts[0])
# The original author reported 394 names here; the live page may differ.
print(len(china_city_names))