目录
爬虫常用的四种解析方式:re、xpath、bs4、parsel
导入模块
from bs4 import BeautifulSoup
源码
# Sample HTML document used as input by the parsing examples that follow.
html = '''<html>
<head>
<title>The Dormouse's story </title>
</head>
<body>
<p class="title">The Dormouse's story</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
</body>
</html>'''
源码转换为BeautifulSoup对象
# Build the BeautifulSoup object: first argument is the HTML source string,
# second is the name of the parser to use ('lxml').
soup = BeautifulSoup(html, 'lxml')
获取title标签
# Attribute access by tag name: prints the document's <title> tag.
print(soup.title)
find 查找符合规则的第一个内容
# find() returns only the first <p> tag that matches.
print(soup.find('p'))
find_all 查找所有符合规则的内容
# find_all() returns every matching <p>; tags nested inside a match
# (e.g. the <a> links) are kept as part of the result.
print(soup.find_all('p'))
string/get_text获取标签内容
find 返回单个标签,可直接用 string / get_text 提取文本;find_all 返回的是标签列表,需要遍历其中的每个标签后再提取文本
# .string and .get_text() are called on a single tag, such as the one
# returned by find(). The collection returned by find_all() does not offer
# them directly; iterate over it and extract text from each tag instead.
print(soup.find('p').string)
print(type(soup.find('p').string)) # a bs4 string object (NavigableString), not a plain str
print(soup.find('p').get_text())
print(type(soup.find('p').get_text())) # a plain str
获取带有属性的标签
# Two equivalent ways to keep only <p> tags whose class is "story":
# an attrs dict, or the class_ keyword (trailing underscore because
# `class` is a reserved word in Python).
print(soup.find_all('p', attrs={'class': 'story'}))
print(soup.find_all('p', class_='story'))
css选择器
print(soup.select('a')) # select() takes a CSS selector; 'a' returns all <a> tags
美化
# prettify() renders the parsed tree as an indented, human-readable string.
print(soup.prettify())
爬取番组计划排行榜案例
import json
from bs4 import BeautifulSoup
import requests
# Scrape the Bangumi (bgm.tv) anime ranking page and append one JSON record
# per listed title to a local text file.
url = 'https://bgm.tv/anime/browser?sort=rank'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
}
# A timeout keeps the script from hanging forever on a stalled connection.
req = requests.get(url, headers=headers, timeout=10)
# Fail early on an HTTP error instead of trying to parse an error page.
req.raise_for_status()
req.encoding = 'utf-8'
# Parse the page source into a BeautifulSoup tree.
html = BeautifulSoup(req.text, 'lxml')
# The <ul> element that contains the ranking entries.
list_ul = html.find('ul', attrs={'id': 'browserItemList'})
if list_ul is None:
    raise RuntimeError('ranking list <ul id="browserItemList"> not found in the response')

# Open the output file once (append mode) instead of re-opening it per entry.
with open('./番组计划排行榜.txt', 'a', encoding='utf8') as f1:
    # Each <li> inside the list is one ranked title.
    for data in list_ul.find_all('li'):
        heading = data.find('h3')
        # Chinese title
        title = heading.find('a').get_text()
        # Foreign-language title; the <small> tag is absent for some entries.
        small = heading.find('small')
        foreign_title = small.get_text() if small is not None else ""
        # Info line split on "/"; assumed layout is
        # "episodes / date / staff..." -- TODO confirm against the live page.
        message = data.find('p').get_text().strip().split("/")
        # Episode count
        hua = message[0]
        # Air date; empty when the info line has no second field.
        year = message[1] if len(message) > 1 else ""
        # Remaining fields (staff / authors)
        staff = message[2:]
        # Rating
        score = data.find('div', class_='inner').find('p', class_='rateInfo').find('small', class_='fade').get_text()
        # Collect the fields into one record.
        cartoon_dict = {
            '中文名': title,
            '外文名': foreign_title,
            '年份': year,
            '话': hua,
            '作家': staff,
            '评分': score,
        }
        # Write the record as pretty-printed JSON, one object after another.
        f1.write(json.dumps(cartoon_dict, ensure_ascii=False, indent=4) + '\n')
2-3个中文字符:[\u4E00-\u9FA5]{2,3}(注意:方括号内开头的 ^ 表示取反,[^\u4E00-\u9FA5] 匹配的是非中文字符)