Python Web Scraping - Traversing the Document Tree with bs4 - Searching the Document Tree with bs4 - CSS Selectors

This article walks through web scraping in Python with the requests and BeautifulSoup libraries: the attributes of a response object, the parameters of the request functions, traversing and searching the document tree, applying CSS selectors, and finally using these techniques to scrape news data from Autohome (汽车之家).

Response object attributes
import requests
url = "https://www.baidu.com/s"

resp = requests.get(url,params={"wd":"egon"},headers={"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3538.77 Safari/537.36"})

# print(resp.text)              # response body decoded as text
# print(resp.content)           # raw response body as bytes
# print(resp.status_code)       # HTTP status code
# print(resp.url)               # final URL of the request
# print(resp.cookies)           # cookies returned by the server
# print(resp.cookies.get_dict())  # the same cookies as a plain dict
# print(resp.json())            # deserialize a JSON response body
# print(resp.request)           # the PreparedRequest object that was sent
# print(resp.apparent_encoding) # encoding detected from the document content
# print(resp.encoding)          # encoding used to decode resp.text
# print(resp.headers)           # response headers
print(resp.history)             # redirect history


resp = requests.get(url,
                    params={"wd":"egon"},
                    headers={"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3538.77 Safari/537.36"},
                    stream=True)
# Read the raw, undecoded byte stream (the response before requests processes it); rarely used directly
print(resp.raw.read(100))
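When a large body really does need to be consumed as a stream, resp.iter_content is the more common tool than resp.raw. The snippet below is a minimal sketch; the image URL, output file name and chunk size are arbitrary illustrative choices, not values from the original article.

# Stream a large response to disk chunk by chunk instead of holding it all in memory
with requests.get("https://www.baidu.com/img/bd_logo1.png", stream=True) as r:
    with open("logo.png", "wb") as f:
        for chunk in r.iter_content(chunk_size=1024):
            f.write(chunk)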

# When the document's declared encoding differs from the encoding requests uses to decode it, set it manually
resp2 = requests.get("http://www.autohome.com/news")
# Decode using the encoding detected from the document content
resp2.encoding = resp2.apparent_encoding
print(resp2.text)
Request function parameters in detail
import requests
# Pass query-string parameters to the server
requests.get('url',params={'key':'value'})

# POST requests: parameters can be sent as form data or as a JSON body (use one or the other)
requests.post("url",data={"name":"jerry","pwd":"123"})
# data is form-encoded as: name=jerry&pwd=123
requests.post("url",json={"name":"jerry","pwd":"123"})
# json is serialized to the JSON string {"name":"jerry","pwd":"123"}
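To see the difference concretely, httpbin.org (already used later in this article) simply echoes back what it received; this check is only a sketch using httpbin as an echo service.

# data is sent form-encoded and comes back under "form"
r1 = requests.post("http://httpbin.org/post", data={"name": "jerry", "pwd": "123"})
print(r1.json()["form"])    # {'name': 'jerry', 'pwd': '123'}
# json is sent as a JSON body and comes back under "json"
r2 = requests.post("http://httpbin.org/post", json={"name": "jerry", "pwd": "123"})
print(r2.json()["json"])    # {'name': 'jerry', 'pwd': '123'}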

# Timeout: the first value is the connect timeout, the second is the read (response) timeout
requests.post("https://www.baidu.com",timeout=(2,2))

# Proxy pool -- an important parameter for crawlers
ps = ["121.228.240.101:9999","121.228.240.101:9999","121.228.240.101:9999","121.228.240.101:9999"]
import random
# Send the request through a randomly chosen proxy server (note the lowercase "http" scheme key)
resp = requests.get("http://news.baidu.com/?tn=news",proxies={"http":random.choice(ps)})
with open('new_baidu.html','wb') as f:
    f.write(resp.content)
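Free proxies fail frequently, so in practice the request is usually wrapped so that a dead proxy is skipped. The helper below is only one possible shape for that; the function name and retry count are made up for illustration.

def get_with_proxy(target_url, proxy_pool, retries=3):
    # Try up to `retries` randomly chosen proxies before giving up (retry count is arbitrary)
    for _ in range(retries):
        proxy = random.choice(proxy_pool)
        try:
            return requests.get(target_url, proxies={"http": proxy}, timeout=(2, 5))
        except requests.exceptions.RequestException:
            continue  # this proxy failed, try another one
    return None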

# Upload a file
f = open(r"D:\aa.png","rb")

# files takes a dict: the key is the field name the server reads the file from, the value is the file object to upload
resp = requests.post("http://httpbin.org/post",files={"img":f})
print(resp.status_code)
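files can also take a (filename, fileobj, content_type) tuple per field when you want to control the uploaded file name and MIME type; a small sketch of that form, with the field and file names chosen only as examples:

# The tuple sets the file name and content type seen by the server
with open(r"D:\aa.png", "rb") as img:
    resp = requests.post("http://httpbin.org/post", files={"img": ("aa.png", img, "image/png")})
    print(resp.status_code)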
BeautifulSoup: traversing the document tree
from bs4 import BeautifulSoup

# The document to parse
html_doc = """
<html>
<head><title>The Dormouse's story</title></head>
<body class="b a c">
<p class="story">
<ssss>hhhh</ssss>
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# The first argument is the document to parse; the second is the parser to use
soup = BeautifulSoup(html_doc,'lxml')
# tag = soup.body
# print(type(tag))
# print(tag.name)
# print(tag.text)
# print(tag.attrs)

# Dot notation looks up a tag by name, but only returns the first match
# tag = soup.a
# print(tag.attrs.get('href'))

# Nested selection
# print(soup.p.a.text)

# Child nodes
# .children returns an iterator
# print(list(soup.p.children))
# for i in soup.head.children:
#     print(i)

# .contents returns a list
# print(soup.p.contents)
# for i in soup.head.contents:
#     print(i)

# Parent tag
# print(soup.p.parent)

# All ancestor tags
# print(list(soup.p.parents))
# for i in soup.p.parents:
#     print(i.name)

# All descendant nodes; every descendant is yielded individually, including text nodes
# print(list(soup.p.descendants))
# for i in soup.p.descendants:
#     print(i)

# Sibling tags; text nodes are also treated as siblings
# Next sibling (the first next_sibling of the a tag is a text node, so chain two calls to reach the next tag)
# print(soup.a.next_sibling.next_sibling)
# All following siblings
# print(list(soup.a.next_siblings))

# Previous sibling
# print(soup.a.previous_sibling)
# All preceding siblings
# print(list(soup.a.previous_siblings))
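As a quick check of the traversal attributes above, the following sketch reuses the soup object from this snippet and walks the first p tag's children, printing each sister's name and link; the isinstance test is there to skip the text nodes between the tags.

from bs4.element import Tag

for child in soup.p.children:
    if isinstance(child, Tag) and child.name == 'a':
        print(child.text, child.attrs.get('href'))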
BeautifulSoup: searching the document tree
from bs4 import BeautifulSoup
import re

# The document to parse
html_doc = """
<html>
<head><title>The Dormouse's story</title></head>
<body class="b a c">
<button/>
<abus/>
<p class="story">
<ssss>hhhh</ssss>
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="1">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc,'lxml')
# Filters: find_all returns every matching tag
# Match by name: pass a single name or a list of names
# print(soup.find_all('a'))
# print(soup.find_all(['a','p']))

# Find the a tag whose id is link1
# print(soup.find_all('a',attrs={'id':'link1'}))
# print(soup.find_all('a',attrs={'class':'sister'}))
# print(soup.find_all(name='a',id='link1'))

# To filter on class, use class_ because class is a Python keyword
# Separate multiple class names with a space
# This form only matches an exact class value, e.g. <a class="sister brother">
# print(soup.find_all(name='a',class_='sister brother'))
# Any tag whose classes include sister will match
# print(soup.find_all(name='a',class_='sister'))
# If the attribute name contains special characters, put the condition in attrs instead
# print(soup.find_all(name='a',attrs={'data-a':'sister'}))

# Match by a tag's text
# print(soup.find_all(name='a',text='Elsie'))

# Other filter types
# Exact name match
# print(soup.find_all(name="a"))
# Regular expression: matches any tag whose name contains the letter b
# res = re.compile('b')
# print(soup.find_all(name=res))

# A list of names
# print(soup.find_all(name=['body','a']))

# True matches every tag
# print(soup.find_all(True))
# Every tag that has an id attribute
# print(soup.find_all(id=True))

# Filter with a function
# The function must take exactly one argument: the tag being tested
def MyFilter(tag):
    return tag.name == "a" and tag.text != "Elsie" and tag.has_attr("id")
print(soup.find_all(MyFilter,limit=1))

# find takes the same arguments as find_all but returns only the first match
print(soup.find('a'))

# Summary: a filter can be a list, a regular expression, a function, or True
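A couple more filter combinations worth knowing, again sketched against the same soup object (string= is the newer name for the text= argument used above; the regular expressions here are chosen purely for this sample document):

# A regular expression can also filter attribute values, not just tag names
print(soup.find_all(href=re.compile(r'example\.com'), limit=2))  # at most two links to example.com
# string= matches against a tag's text; here: a tags whose text starts with L or T
print(soup.find_all('a', string=re.compile(r'^[LT]')))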
CSS selectors
from bs4 import BeautifulSoup

# The document to parse
html_doc = """
<html>
<head><title>The Dormouse's story</title></head>
<body class="b a c">
<button/>
<abus/>
<ssss>hhhh</ssss>
<p class="story">
<ssss>xxxx</ssss>
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="1">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc,'lxml')
# Select by tag name
# print(soup.select('a'))
# Select by class
# print(soup.select('.sister'))
# Select by id
# print(soup.select('#link1'))
# ssss tags that are direct children of a p tag
print(soup.select("p>ssss"))
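A few more selector forms, as a sketch against the same soup object: select_one returns only the first match, and attribute selectors follow normal CSS syntax.

# select_one returns the first matching element (or None)
print(soup.select_one('#link1'))
# Attribute selector: a tags whose href contains "lacie"
print(soup.select('a[href*="lacie"]'))
# Descendant combinator: any a tag anywhere inside a p tag
print(soup.select('p a'))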
Scraping Autohome (汽车之家) news with bs4
import requests
from bs4 import BeautifulSoup

url = "https://www.autohome.com.cn/news/{page}/"

# Filter: keep only the li tags that carry a data-artidanchor attribute (these are the news items)
def news_filter(tag):
    return tag.name == 'li' and tag.has_attr("data-artidanchor")

# Fetch and parse one page of the news list
def get_list_page(url):
    print(url)
    resp = requests.get(url, headers={
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"})
    resp.encoding = resp.apparent_encoding  # decode with the detected encoding (see the autohome note above)
    soup = BeautifulSoup(resp.text, 'lxml')
    lis = soup.find_all(news_filter)
    for t in lis:
        print('https:' + t.a.attrs.get('href'))   # article link (the href is protocol-relative)
        print('https:' + t.img.attrs.get('src'))  # cover image (also protocol-relative)
        print(t.h3.text)                          # headline
        print(t.span.text)
        print(t.em.text)
        print(t.p.text)

get_list_page(url.format(page=1))
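To crawl more than one page, the call above can simply be looped; a minimal sketch reusing get_list_page, where the page range and delay are arbitrary choices rather than values from the original:

import time

for page in range(1, 4):          # first three pages, chosen arbitrarily
    get_list_page(url.format(page=page))
    time.sleep(1)                 # pause between requests to go easy on the server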