crawler-CSDN博客

本文链接：https://blog.csdn.net/xjn___c_c/article/details/115320931

本文通过实战演示如何使用requests库获取文本和图片，以及如何用BeautifulSoup解析工具解析网页内容，包括利用select和find方法获取特定元素，以及用findAll获取所有标题，帮助读者理解爬虫的基本操作和内容提取策略。

摘要由CSDN通过智能技术生成

爬虫

文章目录

爬虫
1 获取文本
- requests库
2 获取图片二进制格式
3 解析与获取内容

1 获取文本

requests库

requests.get()方法

import requests

url = 'https://www.baidu.com/'
# Fetch the page. The timeout keeps the script from blocking forever on a
# stalled connection (requests has no timeout by default).
response = requests.get(url, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of printing an error page as content.
response.raise_for_status()
# Tell requests which encoding to use when decoding the body.
response.encoding = 'utf-8'
# .text is the response body decoded to str.
print(response.text)

2 获取图片二进制格式

import requests

# Address of the image to download.
url = 'https://www.baidu.com/img/PCtm_d9c8750bed0b3c7d089fa7d55720d6cf.png'

# Send the request. The timeout avoids hanging on a stalled connection.
response_data = requests.get(url, timeout=10)
# Stop on a 4xx/5xx answer so an error page is never saved as the image.
response_data.raise_for_status()
# .content is the raw response body as bytes.
print(response_data.content)
# Store the bytes unchanged by opening the file in binary write mode.
with open("figure1.png", 'wb') as f1:
    f1.write(response_data.content)

3 解析与获取内容

用 bs.select()

解析用到的库 bs4 lxml

import requests
from bs4 import BeautifulSoup

# 解析时出现中文编码错误 改变标准输出的默认编码
import sys
import io
# Re-wrap standard output so printed text is encoded as GB18030; this works
# around encoding errors when printing Chinese text to the console.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="gb18030")

url = 'http://news.baidu.com/'
# The timeout keeps the script from blocking forever on a stalled connection.
r = requests.get(url, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page.
r.raise_for_status()
r.encoding = 'utf-8'

# BeautifulSoup(page_source, parser_name) builds the parse tree.
# Two parser choices are shown in this tutorial:
#   'lxml'        - provided by the third-party lxml package
#   'html.parser' - ships with the Python standard library
# The page is parsed once with 'html.parser', which is the tree the original
# script ended up using; pass 'lxml' instead to try the other parser.
bs = BeautifulSoup(r.text, 'html.parser')

# Selector obtained with the browser's "Copy selector":
#   #pane-news > div > ul > li.hdline0 > strong > a
# Read it as: the <a> inside the <strong> inside the <li class="hdline0">
# inside the <ul> inside the <div> under the element whose id is "pane-news".
# '#' selects by id (an id is unique within a page); <strong> is bold text.

# bs.select(css_selector) returns a list of every matching element.
v_title = bs.select("#pane-news > div > ul > li.hdline0 > strong > a")
if v_title:
    # Take the first match and print only its text content.
    print(v_title[0].text)
else:
    # The page layout may have changed; report it instead of raising IndexError.
    print("no element matched the selector")

用bs.find() 根据属性获取具体内容

import requests
from bs4 import BeautifulSoup

# Headline-news section of http://www.cnstock.com/ ; the full address of the
# list page is:
# http://news.cnstock.com/news/sns_yw/index.html

url = 'http://news.cnstock.com/news/sns_yw/index.html'
# The timeout keeps the script from blocking forever on a stalled connection.
res = requests.get(url, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page.
res.raise_for_status()
# res.text is the page source.
bs = BeautifulSoup(res.text, 'lxml')

# bs.find(tag_name, {attribute: value}) returns the first element whose
# attribute has the given value, or None when nothing matches. Example:
#   bs.find('a', {"href": "https://news.cnstock.com/news,bwkx-202103-4679616.htm"})

# An id is unique within a page; a class is not.
# The id below was copied from the page when the article was written and
# changes over time, so the lookup may find nothing on a later run.
item = bs.find('li', {'id': '2cc6a8838f16c0490097a0ba456fdbe2'})
# Only descend into the <li> when it exists; chaining .find() on None would
# raise AttributeError.
tag_des = item.find('p', {'class': 'des'}) if item is not None else None

print(tag_des)

练习 bs.findAll() 获取页面中的全部标题

bs.findAll(标签，{属性：属性值})

import requests
from bs4 import BeautifulSoup
# Collect every headline with bs.findAll(tag_name, {attribute: value}).
# findAll is the legacy spelling of find_all in Beautiful Soup 4.

url = 'http://news.cnstock.com/news/sns_yw/index.html'
# The timeout keeps the script from blocking forever on a stalled connection.
res = requests.get(url, timeout=10)
if res.status_code == 200:
    bs = BeautifulSoup(res.text, 'lxml')
    tag = bs.findAll('li', {'class': 'newslist'})
    # Keyword form of the same query: bs.findAll('li', class_='newslist')
    # The keyword is spelled class_ because class is a reserved word in Python.

    # When the wanted value is the element's text, read .text instead, e.g.
    #   tmp.find('h2').find('a', {'target': '_blank'}).text

    # Here the wanted value is stored in an attribute of the <a> tag, so it is
    # read from the attribute mapping.
    for tmp in tag:
        ans = tmp.find('a', {'target': '_blank'})
        if ans is None:
            # This list item has no matching link; skip it rather than crash.
            continue
        # .attrs is a dict of the tag's attributes; .get avoids a KeyError
        # when the link carries no title attribute.
        title = ans.attrs.get('title')
        if title is not None:
            print(title)
else:
    # Report the failure instead of finishing silently with no output.
    print("request failed with status", res.status_code)