1. Installing bs4 and the three ways to use it
# After installing (pip install beautifulsoup4 lxml), import BeautifulSoup from the bs4 package
from bs4 import BeautifulSoup
# Define the HTML document content
html_doc = """
<html><head><title abc="123">The Dormouse's story</title></head> <body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Create a BeautifulSoup object; specifying the parser explicitly is recommended:
soup = BeautifulSoup(html_doc, 'lxml')
# 1. Access document data through Tag objects
# r = soup.title
# r = soup.title['abc']
# r = soup.p
# r = soup.p['class']
# r = soup.title.text
# r = soup.p.parent.name
# print(r)
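# For reference, with the html_doc above the Tag shortcuts return roughly the following
# (an active sketch of the commented lines above; exact output depends on the document):
print(soup.title)            # <title abc="123">The Dormouse's story</title>
print(soup.title['abc'])     # 123 - attribute access on a Tag
print(soup.p['class'])       # ['title'] - class is multi-valued, so bs4 returns a list
print(soup.title.text)       # The Dormouse's story
print(soup.p.parent.name)    # body - navigate upwards via .parent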
# 2. Search the page for elements with find / find_all
# r = soup.find('a')
# r = soup.find_all('a')
# r = soup.find('title')
# # print(r,type(r))
# print(r.text)
# print(r.get_text())
# print(r)
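# A quick sketch of the difference: find() returns only the first matching Tag
# (or None if nothing matches), while find_all() returns a list of every match;
# both accept attribute filters such as id=... and class_=... (class_ avoids the keyword clash).
first_link = soup.find('a')                       # first <a> tag
all_links = soup.find_all('a')                    # list of all three <a> tags
lacie = soup.find('a', id='link2')                # filter by attribute
sisters = soup.find_all('a', class_='sister')     # filter by CSS class
print(first_link['href'], len(all_links), lacie.text, len(sisters))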
# 3. CSS selectors
# Select elements by tag name
r = soup.select('title')
# Select elements by class name
r = soup.select('.title')
# Select elements by id
r = soup.select('#link2')
# A space expresses a descendant (hierarchy) relationship
r = soup.select('html body p')
# A comma combines several selectors (union of matches)
r = soup.select('a,title')
# print(r)
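# Note that select() always returns a list, even for a single match, so the results
# are usually iterated; select_one() returns just the first match (or None).
# A minimal sketch using the html_doc above:
for tag in soup.select('p.story a'):
    print(tag.get_text(), tag['href'])
print(soup.select_one('#link2')['href'])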
2. bs4 in practice - 学习猿地 猿圈
# 学习猿地 - 猿圈
'''
Analysis of the data to scrape
Data source URL: https://www.lmonkey.com/t
Data fields: article title, article link, author, publish time
Tools:
python, requests, bs4, json
'''
import requests, json
from bs4 import BeautifulSoup
# 1. Define the request URL and request headers
url = 'https://www.lmonkey.com/t'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
}
# 2. Send the request
res = requests.get(url, headers=headers)
# 3. Check whether the request succeeded and get the page source
if res.status_code == 200:
    # 4. Parse the data
    soup = BeautifulSoup(res.text, 'lxml')
    # Get every article entry on the page
    divs = soup.find_all('div', class_="list-group-item list-group-item-action p-06")
    varlist = []
    for i in divs:
        r = i.find('div', class_="topic_title")
        if r:
            vardict = {
                'title': r.text.split('\n')[0],
                'url': i.a['href'],
                'author': i.strong.a.text,
                'pubdate': i.span['title']
            }
            varlist.append(vardict)
    # print(varlist)
    # 5. Write the data to a JSON file
    with open('./yq.json', 'w') as fp:
        json.dump(varlist, fp)
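    # Note: json.dump escapes non-ASCII characters by default, so Chinese titles are
    # stored as \uXXXX sequences; passing ensure_ascii=False (and opening the file with
    # encoding='utf-8') keeps them readable. A quick read-back check of what was written:
    with open('./yq.json', 'r') as fp:
        print(len(json.load(fp)), 'articles saved to yq.json')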
3. bs4 in practice - 猿圈 code refactor
# 学习猿地 - 猿圈
'''
Analysis of the data to scrape
Data source URL: https://www.lmonkey.com/t
Data fields: article title, article link, author, publish time
Tools:
python, requests, bs4, json
'''
import requests, json
from bs4 import BeautifulSoup
# Wrap the crawler in a class
class Bs4Yq():
    # Class attributes
    # Request URL
    url = 'https://www.lmonkey.com/t'
    # Request headers
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    # Holds the response source code
    res_html = None
    # Holds the parsed data
    varlist = []

    # Constructor
    def __init__(self):
        # Send the request
        res = requests.get(self.url, headers=self.headers)
        if res.status_code == 200:
            self.res_html = res.text
            if self.ParseData():
                self.WriteJson()
                print('Request succeeded; data written to file')
        else:
            print('Request failed')

    # Parse the HTML data
    def ParseData(self):
        soup = BeautifulSoup(self.res_html, 'lxml')
        try:
            # Get every article entry on the page
            divs = soup.find_all('div', class_="list-group-item list-group-item-action p-06")
            for i in divs:
                r = i.find('div', class_="topic_title")
                if r:
                    vardict = {
                        'title': r.text.split('\n')[0],
                        'url': i.a['href'],
                        'author': i.strong.a.text,
                        'pubdate': i.span['title']
                    }
                    self.varlist.append(vardict)
            return True
        except Exception:
            return False

    # Write the data to a JSON file
    def WriteJson(self):
        if self.varlist != []:
            try:
                with open('./yq.json', 'w') as fp:
                    json.dump(self.varlist, fp)
                return True
            except Exception:
                return False
        else:
            print('No parsed data available to write')
            return False
Bs4Yq()
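# Note: if this script were imported as a module elsewhere, the bare Bs4Yq() call
# above would normally be wrapped in a main guard so the crawl only runs when the
# file is executed directly:
#
#     if __name__ == '__main__':
#         Bs4Yq()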