python爬虫数据解析--xpath解析详细案例讲解

weixin_45073393

已于 2023-08-04 10:54:32 修改

阅读量291

点赞数

分类专栏：python爬虫　文章标签：python 爬虫 开发语言 csdn gpt pycharm mysql

于 2023-08-04 10:51:06 首次发布

本文链接：https://blog.csdn.net/weixin_45073393/article/details/132099265

版权

python爬虫专栏收录该内容

1 篇文章

订阅专栏

一、实验原理

使用通用爬虫爬取网页数据
实例化etree对象，且将页面数据加载到该对象中
使用xpath函数结合xpath表达式进行标签定位和指定数据提取

1.1 etree对象实例化

本地文件：
tree = etree.parse(文件名)
tree.xpath("xpath表达式")
网络数据：
tree = etree.HTML(网页内容字符串)
tree.xpath("xpath表达式")

1.2 使用规范

/表示一个层级
//表示多个层级
//可以表示从任意位置开始定位
./表示从当前节点（标签）开始定位
xpath表达式获取捷径：F12打开开发者工具，找到HTML源代码，定位到标签，右击复制xpath表达式。

二、实战案例

2.1 爬取二手房文本标题

import requests
from lxml import etree

if __name__ == "__main__":
    # Script: download the 58.com Beijing second-hand housing page and save
    # each listing title, one per line, to 58.txt.
    url = 'https://bj.58.com/ershoufang/'
    # Send a browser User-Agent header with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188'
    }
    # Fetch the HTML; the timeout keeps the script from hanging forever on a
    # stalled connection.
    page_text = requests.get(url=url, headers=headers, timeout=10).text

    # Parse the page and select every title container in the listing section.
    tree = etree.HTML(page_text)
    title_nodes = tree.xpath('//*[@id="esfMain"]/section/section[3]/section[1]/section[2]//div[@class="property-content-title"]')

    # `with` guarantees the output file is flushed and closed, even if an
    # exception is raised while writing.
    with open('58.txt', 'w', encoding='utf-8') as fp:
        for node in title_nodes:
            # XPath relative to the current node; yields a list of strings.
            texts = node.xpath('./h3/text()')
            if not texts:
                # No title text under this node: skip it rather than fail
                # with IndexError on texts[0].
                continue
            fp.write(texts[0] + '\n')

2.2 爬取图片

# 实战爬取图片
import requests
from lxml import etree
import os

if __name__ == "__main__":
    # Script: download every image listed on the pic.netbian.com page below
    # and store the files under ./piclibs/.
    url = 'http://pic.netbian.com/4Kyuanchuang/'
    # Send a browser User-Agent header with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188"
    }
    # Fetch the listing page; the timeout prevents an indefinite hang.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Decode the body as GBK explicitly instead of relying on the detected
    # encoding.
    response.encoding = 'gbk'
    response_text = response.text

    # Select the <li> elements of the image list.
    tree = etree.HTML(response_text)
    li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    print(li_list)

    # Create the output folder; exist_ok avoids the check-then-create race
    # and is a no-op when the folder already exists.
    os.makedirs('./piclibs/', exist_ok=True)

    for li in li_list:
        alt_values = li.xpath("./a/img/@alt")
        src_values = li.xpath("./a/img/@src")
        if not alt_values or not src_values:
            # Entry without an <a><img alt=... src=...> child: skip it
            # rather than fail with IndexError.
            continue
        # File name is the last 10 characters of the alt text plus ".jpg".
        title = alt_values[0][-10:] + '.jpg'
        # The src attribute is appended to the site root to build the URL.
        img_url = "http://pic.netbian.com" + src_values[0]
        # Download the raw image bytes.
        response_img = requests.get(url=img_url, headers=headers, timeout=10).content
        img_name = "./piclibs/" + title
        with open(img_name, 'wb') as fp:
            fp.write(response_img)
        # Report success only after the file has been written and closed.
        print(title + "保存成功！！！")

2.3 爬取全国城市的名称

# 爬取全国城市
import requests
from lxml import etree

if __name__ == "__main__":
    # Script: collect the city names linked from the aqistudy.cn history
    # page and write the resulting list to airdata.txt.
    url = 'https://www.aqistudy.cn/historydata/'
    # Send a browser User-Agent header with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188'
    }
    # Fetch the HTML; the timeout prevents an indefinite hang.
    page_text = requests.get(url=url, headers=headers, timeout=10).text

    # Parse the page. The expression is the union (|) of two absolute paths,
    # so <li> elements from both lists are returned by a single query.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('/html/body/div[3]/div/div[1]/div[1]/div[2]/ul/li | /html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li')

    # Number of matched <li> elements.
    print(len(li_list))
    city_list = []

    for li in li_list:
        # XPath relative to the current <li>; yields a list of strings.
        names = li.xpath('./a/text()')
        if not names:
            # No link text under this <li>: skip it rather than fail with
            # IndexError on names[0].
            continue
        city_name = names[0]
        print(city_name)
        city_list.append(city_name)

    # Write the list's string representation in one call. The file content
    # matches what writelines(str(...)) produced, without iterating the
    # string character by character. `with` closes the file on exit.
    with open('airdata.txt', 'w', encoding='utf-8') as fp:
        fp.write(str(city_list))