Python 爬虫案例

最新推荐文章于 2024-08-26 18:56:03 发布

山河Z

最新推荐文章于 2024-08-26 18:56:03 发布

阅读量475

点赞数

文章标签： python 爬虫 开发语言

本文链接：https://blog.csdn.net/m0_67332253/article/details/131707188

版权

beautifulsoup4解析数据

简介

1. 前面已经可以获取到数据了，但是数据中有大量无用的数据，因此如果想使用这些数据，就需要进行解析了。

2. 其实解析方式有很多，比如正则、bs4、xml等都是可以的，其中re的速度是最快的，但是这对正则表达式的书写能力有一定的要求。

3. bs4也可以做解析，而且是一种最简单的实现方式。

通过bs4，可以将复杂的html文档，转化成一个树形结构，每个节点都是python对象。
bs4是一个第三方的库，可以执行命令进行安装：pip install beautifulsoup4

beautifulsoup4解析器

使用举例

测试字符串

# Sample HTML document used as parser input by the examples
html_doc = """
<html>
    <head>
        <title>测试页面</title>
    </head>
<body>
        <p class="title aq">
            这是首页
        </p>
        <p class="story">
            你好北京
            <a href="http://www.baidu.com" class="sister" id="link1">百度</a>
            <a href="http://www.qq.com" class="sister" id="link2">QQ</a>
            and
            <a href="http://www.sina.com" class="sister" id="link3">新浪</a>
        </p>
        <p class="story">            
        ...
        </p>
    </body>
</html>
"""

输出第一个title标签的相关内容

from bs4 import BeautifulSoup
import bs4
# HTML document to analyse
html_doc = '''
<html>
    <head>
        <title>测试页面</title>
    </head>
    <body>
        <p class="title aq">
            这是首页
        </p>
        <p class="story">
            你好北京
            <a href="http://www.baidu.com" class="sister" id="link1">百度</a>
            <a href="http://www.qq.com" class="sister" id="link2">QQ</a>
            and
            <a href="http://www.sina.com" class="sister" id="link3">新浪</a>
        </p>
        <p class="story">
            ...
        </p>
    </body>
</html>
'''
# Build the BeautifulSoup object with the built-in html.parser.
# html_doc is already a str, so from_encoding is not passed: bs4 only uses
# it for bytes input and warns that it is ignored for Unicode markup.
bs_obj = BeautifulSoup(html_doc, 'html.parser')

# .children is an iterator and has no .name attribute, so collect the names
# of the direct child tags explicitly (<title> only holds text, so this is
# an empty list for the sample document).
child_names = [tag.name for tag in bs_obj.title.find_all(True, recursive=False)]

# Print the first <title> tag and its properties
print(f"第一个title标签：{bs_obj.title}")
print(f"第一个title标签的内容：{bs_obj.title.text}")
print(f"第一个title标签的内容：{bs_obj.title.string}")
print(f"第一个title标签的名字：{bs_obj.title.name}")
print(f"第一个title标签的父标签的名字：{bs_obj.title.parent.name}")
print(f"第一个title标签的子标签的名字：{child_names}")

输出第一个p标签的内容

from bs4 import BeautifulSoup
import bs4
# HTML document to analyse
# (the stray comma after the class attribute was removed: html.parser would
# otherwise record it as an extra attribute named "," on the tag)
html_doc = '''
<html>
    <head>
        <title>测试页面</title>
    </head>
    <body>
        <p class="title aq" id=5>
            这是首页
        </p>
        <p class="story">
            你好北京
            <a href="http://www.baidu.com" class="sister" id="link1">百度</a>
            <a href="http://www.qq.com" class="sister" id="link2">QQ</a>
            and
            <a href="http://www.sina.com" class="sister" id="link3">新浪</a>
        </p>
        <p class="story">
            ...
        </p>
    </body>
</html>
'''
# Build the BeautifulSoup object with the built-in html.parser.
# html_doc is already a str, so from_encoding is not passed: bs4 only uses
# it for bytes input and warns that it is ignored for Unicode markup.
bs_obj = BeautifulSoup(html_doc, 'html.parser')
# Print the first <p> tag, its text, and its class attribute
# (class is multi-valued, so bs4 returns it as a list)
print(f"第一个p标签：{bs_obj.p}")
print(f"第一个p标签的内容：{bs_obj.p.text}")
print(f"第一个p标签的class属性：{bs_obj.p['class']}")

对第一个p标签的子标签进行遍历输出

from bs4 import BeautifulSoup
import bs4
# HTML document to analyse
# (the stray comma after the class attribute was removed: html.parser would
# otherwise record it as an extra attribute named "," on the tag)
html_doc = '''
<html>
    <head>
        <title>测试页面</title>
    </head>
    <body>
        <p class="title aq" id=5>
            这是首页
    </p>
        <p class="story">
            你好北京
            <a href="http://www.baidu.com" class="sister" id="link1">百度</a>
            <a href="http://www.qq.com" class="sister" id="link2">QQ</a>
            and
            <a href="http://www.sina.com" class="sister" id="link3">新浪</a>
        </p>
        <p class="story">
            ...
        </p>
    </body>
</html>
'''
# Build the BeautifulSoup object with the built-in html.parser.
# html_doc is already a str, so from_encoding is not passed: bs4 only uses
# it for bytes input and warns that it is ignored for Unicode markup.
bs_obj = BeautifulSoup(html_doc, 'html.parser')
# Walk the direct children of the first <p>, numbering them from 1.
# For a child that is a tag, child.name is the tag name and child.text its text.
for n, child in enumerate(bs_obj.p.children, start=1):
    print(f"第 {n} 个子节点是 {child}")

案例：用google去搜索 inurl: .php?id=1 公司

import requests         # pip install requests -i https://pypi.douban.com/simple
from requests.packages.urllib3.exceptions import InsecureRequestWarning

import urllib                 # 这个模块是用于实现url解码

from urllib import parse      # 这个模块是用于实现url解码

from bs4 import BeautifulSoup # pip install bs4 -i https://pypi.douban.com/simple

class SearchByGoogle(object):
    """Send one search request and extract the result links from the page.

    Typical use: construct the object, call ``searchMsg()`` to fetch the
    page, then ``getDestUlr()`` to print (and get back) the result URLs.
    """

    def __init__(self, url, header, data, timeout=10):
        """Store the request settings.

        Args:
            url: Address of the search endpoint.
            header: Dict of HTTP request headers.
            data: Dict of query-string parameters.
            timeout: Seconds to wait for the server before requests raises
                a timeout error; without it a stalled server blocks forever.
        """
        self.url = url
        self.header = header
        self.data = data
        self.timeout = timeout
        # Start with an empty page so getDestUlr() is safe to call even if
        # searchMsg() has not run yet.
        self.resp_decode = ""

        # Silence urllib3's InsecureRequestWarning.
        # NOTE(review): this only hides the warning; it does not by itself
        # work around requests.exceptions.SSLError or the server refusing
        # requests after many searches.
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

    def searchMsg(self):
        """Fetch the search page and keep its URL-decoded text in ``self.resp_decode``."""
        resp = requests.get(
            url=self.url,
            headers=self.header,
            params=self.data,
            timeout=self.timeout,
        )

        # Force UTF-8 so Chinese text in the page is not garbled.
        resp.encoding = "utf-8"

        # Percent-decode the page text so the extracted links are readable.
        self.resp_decode = urllib.parse.unquote(resp.text)

    def getDestUlr(self):
        """Print and return the href of every result link in the fetched page.

        Returns:
            list: The href values in document order; empty when no link
            matched or no page has been fetched yet.
        """
        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed and emits a warning.
        soup = BeautifulSoup(self.resp_decode, "html.parser")

        # All <a> tags that are direct children of a result container div.
        anchors = soup.select("div[class='P8ujBc v5yQqb jqWpsc'] > a")

        # Skip anchors without an href instead of raising KeyError.
        hrefs = [anchor["href"] for anchor in anchors if anchor.has_attr("href")]
        for href in hrefs:
            print(href)
        return hrefs


if __name__ == '__main__':
    # Address of the search endpoint the request is sent to
    url = "http://144.24.32.107/search"
    # Request headers: a mobile Chrome User-Agent. The value is split into
    # adjacent literals because a single-quoted string cannot span lines.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/112.0.0.0 Mobile Safari/537.36'
        ),
    }

    # Query parameters, copied from the request payload
    data = {
        'q': 'inurl: .php?id=1 公司',
        'ei': 'tKhVZIaBK9uxqtsPrPesOA',
        'ved': '0ahUKEwiGq_3rwN_-AhXbmGoFHaw7CwcQ4dUDCBA',
        'oq': 'inurl: login.php 公司',
        'gs_lcp': 'Cgxnd3Mtd2l6LXNlcnAQDEoECEEYAFAAWABgAGgAcAB4AIABAIgBAJIBAJgBAA',
        'sclient': 'gws-wiz-serp'
      }

    # Create the searcher
    obj = SearchByGoogle(url, headers, data)

    # Run the search
    obj.searchMsg()

    # Parse the response and print the target URLs
    obj.getDestUlr()

山河Z

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
Python 爬虫案例

1. 前面已经可以获取到数据了，但是数据中有大量无用的数据，因此如果想使用这些数据，就需要进行解析了。2. 其实解析方式有很多，比如正则、bs4、xml等都是可以的，其中re的速度是最快的，但是这对正则表达式的书写能力有一定的要求。3. bs4也可以做解析，而且是一种最简单的实现方式。通过bs4，可以将复杂的html文档，转化成一个树形结构，每个节点都是python对象。bs4是一个第三方的库，可以执行命令进行安装：pip install beautifulsoup4。案例：用google去搜索 inurl: .php?id=1 公司。
复制链接

扫一扫