from bs4 import BeautifulSoup
# The document to parse
html_doc = """
<html>
<head><title>The Dormouse's story</title></head>
<body class="b a c">
<p class="story">
<ssss>hhhh</ssss>
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""# 第一个参数为要解析的文档数据
soup = BeautifulSoup(html_doc,'lxml')# tag = soup.body# print(type(tag))# print(tag.name)# print(tag.text)# print(tag.attrs)# 使用点语法查找标签,只能找到第一个名字匹配的标签# tag = soup.a# print(tag.attrs.get('href'))# 嵌套选择# print(soup.p.a.text)# 获取子节点# print(list(soup.p.children))# 返回一个迭代器# for i in soup.head.children:# print(i)# print(soup.p.contents)# 返回一个列表# for i in soup.head.contents:# print(i)# 获取父标签# print(soup.p.parent)# 获取所有的父辈标签# print(list(soup.p.parents))# for i in soup.p.parents:# print(i.name)# print(list(soup.p.descendants))# 获取所有子孙标签,会把所有子孙全部拆出来 包括文本内容# for i in soup.p.descendants:# print(i)# 获取兄弟标签,文本也被当做是一个节点# 下一个兄弟# print(soup.a.next_sibling.next_sibling)# 之后的兄弟们# print(list(soup.a.next_siblings))# 上一个兄弟# print(soup.a.previous_sibling)# 之前的兄弟们# print(list(soup.a.previous_siblings))
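To see the difference between the traversal attributes concretely, here is a minimal sketch (reusing the soup built above) that labels each node it visits; text fragments show up as NavigableString nodes alongside Tag nodes:

# Minimal sketch: direct children vs. the fully flattened descendants
for child in soup.p.children:
    # children yields only direct children: <ssss>, the loose text
    # fragments, and the three <a> tags
    print('child:', type(child).__name__)
for node in soup.p.descendants:
    # descendants also yields the text nested inside each child tag
    print('descendant:', type(node).__name__)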
BeautifulSoup: Searching the Document Tree
from bs4 import BeautifulSoup
import re
# The document to parse
html_doc = """
<html>
<head><title>The Dormouse's story</title></head>
<body class="b a c">
<button/>
<abus/>
<p class="story">
<ssss>hhhh</ssss>
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="1">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# Filters: find_all returns every tag that matches
# Match by name; pass a single name or a list of names
# print(soup.find_all('a'))
# print(soup.find_all(['a', 'p']))

# Find the <a> tag whose id is link1
# print(soup.find_all('a', attrs={'id': 'link1'}))
# print(soup.find_all('a', attrs={'class': 'sister'}))
# print(soup.find_all(name='a', id='link1'))

# To filter on class, use class_ because class is a Python keyword
# Separate multiple class names with a space; this form only finds tags
# whose class attribute matches exactly, e.g. <a class="sister brother">
# print(soup.find_all(name='a', class_='sister brother'))
# A single class name matches any tag that carries it
# print(soup.find_all(name='a', class_='sister'))

# If the attribute name contains special characters, put it in attrs
# print(soup.find_all(name='a', attrs={'data-a': 'sister'}))

# Match on the tag's text
# print(soup.find_all(name='a', text='Elsie'))

# Other filter forms
# An exact name
# print(soup.find_all(name="a"))
# A regular expression: tag names containing the letter b
# res = re.compile('b')
# print(soup.find_all(name=res))
# A list of names
# print(soup.find_all(name=['body', 'a']))
# True matches every tag
# print(soup.find_all(True))
# Every tag that has an id attribute
# print(soup.find_all(id=True))

# A function filter: the function must take exactly one argument,
# the tag being tested, and return True to keep it
def MyFilter(tag):
    return tag.name == "a" and tag.text != "Elsie" and tag.has_attr("id")

print(soup.find_all(MyFilter, limit=1))

# find takes the same arguments as find_all but returns only the first match
print(soup.find('a'))

# Summary: a filter can be a name, a list, a regex, a function, or True
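As a quick sanity check on the filter forms above, this sketch (assuming the same soup) runs one example of each kind and prints how many tags it matched; the counts in the comments are what the html_doc above should produce:

# One example per filter kind; counts assume the html_doc above
print(len(soup.find_all('a')))                 # name: the three sister links
print(len(soup.find_all(re.compile('^b'))))    # regex: body and button
print(len(soup.find_all(['button', 'abus'])))  # list of names: 2 matches
print(len(soup.find_all(id=True)))             # any tag with an id: 3
print(len(soup.find_all(True)))                # every tag in the document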
CSS Selectors
from bs4 import BeautifulSoup
# The document to parse
html_doc = """
<html>
<head><title>The Dormouse's story</title></head>
<body class="b a c">
<button/>
<abus/>
<ssss>hhhh</ssss>
<p class="story">
<ssss>xxxx</ssss>
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="1">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# print(soup.select('a'))        # by tag name
# print(soup.select('.sister'))  # by class
# print(soup.select('#link1'))   # by id

# Direct child ssss tags of a p tag
print(soup.select("p > ssss"))
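select accepts most standard CSS, not only the child combinator shown above. A short sketch of a few more selector forms that work against the same soup; select_one returns just the first match (or None):

# A few more CSS selector forms, run against the same soup
print(soup.select_one('#link1'))         # first match only, or None
print(soup.select('p ssss'))             # descendant: any ssss inside a p
print(soup.select('a[href$="tillie"]'))  # attribute ends-with match
print(soup.select('p.story > a'))        # tag with class, then direct child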
Scraping Autohome News with bs4
import requests
from bs4 import BeautifulSoup
url ="https://www.autohome.com.cn/news/{page}/"# 过滤标签deffilter(tag):return tag.name =='li'and tag.has_attr("data-artidanchor")# 获取新闻列表defget_list_paget(url):print(url)
resp = requests.get(url, headers={"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"})
soup = BeautifulSoup(resp.text,'lxml')
lis = soup.find_all(filter)for t in lis:print('https:'+t.a.attrs.get('href'))print('https:'+t.img.attrs.get('src'))print(t.h3.text)print(t.span.text)print(t.em.text)print(t.p.text)
get_list_paget(url.format(page=1))
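To crawl several pages, the same function can be driven in a loop. The sketch below is a hypothetical extension, not part of the original: the page range and the one-second delay are assumptions, with the sleep included only to avoid hammering the server:

import time

# Hypothetical driver: walk the first three news pages with a short pause
for page in range(1, 4):
    get_list_page(url.format(page=page))
    time.sleep(1)  # assumed delay, purely to be polite to the server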