1、pip install beautifulsoup4
解析html文件:
import requests
from bs4 import BeautifulSoup

# Fetch the demo page's HTML source (equivalent to the browser's
# "view page source" context-menu entry).
response = requests.get("http://python123.io/ws/demo.html")
html_text = response.text

# Parse with the built-in html.parser backend and pretty-print the tree.
soup = BeautifulSoup(html_text, "html.parser")
print(soup.prettify())
2、beautifulsoup库的基本元素
import requests
from bs4 import BeautifulSoup
# Fetch the page's HTML source (equivalent to "view source" in a browser).
r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo,"html.parser")
#print(soup.prettify())
# The page's <title> tag.
print(soup.title)
# soup.<tagname> returns the FIRST matching tag -- here the first <a> link.
tag = soup.a
print(tag)
print(soup.a.name)                  # tag's name: 'a'
print(soup.a.parent.name)           # name of the enclosing tag
print(soup.a.parent.parent.name)    # and of its grandparent
print(tag.attrs)                    # all attributes as a dict
print(tag.attrs['class'])
print(tag.attrs['href'])
print(type(tag.attrs))              # plain dict
print(type(tag))                    # bs4.element.Tag
print(soup.a.string)                # text content of the first <a>
print(soup.p)                       # first <p> tag
print(soup.p.string)
print(type(soup.p.string))          # bs4.element.NavigableString
from bs4 import BeautifulSoup
# .string strips comment markers, so a Comment looks like ordinary text;
# distinguish them by checking the node's type.
newsoup = BeautifulSoup("<b><!--This is a comment--></b><p> This is not a comment</p>","html.parser")
print(newsoup.b.string)             # comment content
print(type(newsoup.b.string))       # bs4.element.Comment
print(newsoup.p.string)             # real text
print(type(newsoup.p.string))       # bs4.element.NavigableString
3、基于bs4库的html内容遍历方法
标签树的下行遍历:
import requests
from bs4 import BeautifulSoup

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")

# <head> and the direct-children list of a tag (.contents is a real list).
print(soup.head)
print(soup.head.contents)
print(soup.body.contents)
len(soup.body.contents)        # number of direct children of <body>
print(soup.body.contents[0])
print(soup.body.contents[4])

# Iterate over direct children only.
for child in soup.body.children:
    print(child)

# Iterate over ALL descendants (children, grandchildren, ...).
# BUG FIX: the original reused .children here; .descendants is the
# generator that walks the entire subtree.
for descendant in soup.body.descendants:
    print(descendant)
标签树的上行遍历:
import requests
from bs4 import BeautifulSoup

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")

# Upward traversal: .parent climbs one level at a time.
print(soup.title.parent)
print(soup.html.parent)   # <html>'s parent is the whole document
print(soup.parent)        # the soup object itself has no parent -> None

# Walk every ancestor of the first <a> tag. The topmost node yielded by
# .parents can be None-like, so guard before reading .name.
for ancestor in soup.a.parents:
    print(ancestor.name if ancestor is not None else ancestor)
标签树的平行遍历:
import requests
from bs4 import BeautifulSoup

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")

# Parallel (sibling) traversal. NOTE: siblings include NavigableString
# nodes (whitespace between tags), not only Tag objects.
print(soup.a.next_sibling)
print(soup.a.next_sibling.next_sibling)
print(soup.a.previous_sibling)
print(soup.a.previous_sibling.previous_sibling)
print(soup.a.parent)

# Iterate over all following / preceding siblings.
# BUG FIX: the original looped over .next_sibling / .previous_sibling,
# which are SINGLE nodes (iterating a NavigableString yields individual
# characters). The plural .next_siblings / .previous_siblings are the
# generators intended for traversal.
for sibling in soup.a.next_siblings:
    print(sibling)
for sibling in soup.a.previous_siblings:
    print(sibling)
4、基于bs4库的html格式输出
import requests
from bs4 import BeautifulSoup

# prettify() re-renders the parse tree with one tag per line, making the
# document structure easy to read.
page = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(page.text, "html.parser")
print(soup.prettify())      # the whole document
print(soup.a.prettify())    # an individual tag can be prettified too
编码问题:bs4库会将读入的HTML文件或字符串统一转换为UTF-8编码,因此prettify()等方法的输出可以直接以UTF-8打印,不必再手动处理编码。
5、信息标记的三种形式
<>+tag
有类型的键值对
无类型键值对
提取HTML中所有URL链接:
步骤:1 搜索所有<a>标签。2 解析<a>标签格式,提取href后的链接内容。
import requests
from bs4 import BeautifulSoup

# Extract every URL in the page: find all <a> tags, then read each one's
# href attribute.
r = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(r.text, "html.parser")
hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
for url in hrefs:
    print(url)
基于bs4库的HTML内容查找方法:
find_all:
import requests
import re
from bs4 import BeautifulSoup

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")

# find_all(name): match by tag name, a list of names, True (all tags),
# or a compiled regular expression applied to tag names.
print(soup.find_all('a'))
print(soup.find_all(['a', 'b']))
for tag in soup.find_all(True):             # every tag in the document
    print(tag.name)
for tag in soup.find_all(re.compile('b')):  # tag names containing 'b'
    print(tag.name)

# find_all(name, attrs): filter on attribute values.
print(soup.find_all('p', 'course'))
print(soup.find_all(id='link1'))
print(soup.find_all(id='link'))             # exact attribute match only -> []
print(soup.find_all(id=re.compile('link'))) # regex matches link1, link2, ...

# recursive=False restricts the search to direct children.
print(soup.find_all('a', recursive=False))

# string= matches text content, exactly or by regex.
print(soup.find_all(string="Basic Python"))
print(soup.find_all(string=re.compile("python")))
示例一:
中国大学排名定向爬虫
#CrawUnivRankingA.py
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Fetch *url* and return its decoded body text; return "" on failure."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise for non-2xx HTTP status codes
        # Use the encoding guessed from the content, not just the header,
        # so Chinese pages decode correctly.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` -- only network/HTTP errors should
        # be swallowed into the ""-means-failure contract; anything else
        # (e.g. a typo'd name) should surface as a real bug.
        return ""
def fillUnivList(ulist, html):
    """Parse the ranking page *html*, appending [rank, name, score] rows to *ulist*."""
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # Empty or malformed page (e.g. getHTMLText returned "" after a
        # network failure): leave ulist unchanged instead of crashing on
        # None.children.
        return
    for tr in tbody.children:
        # Skip the NavigableString whitespace nodes between <tr> rows.
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')  # tag(...) is shorthand for tag.find_all(...)
            ulist.append([tds[0].string, tds[1].string, tds[3].string])
def printUnivList(ulist, num):
    """Print the first *num* rows of *ulist* as an aligned three-column table.

    Each row is a [rank, name, score] sequence of strings.
    """
    # {3} refers to format()'s 4th argument, chr(12288) -- the full-width
    # (CJK) ideographic space -- used as the fill character so Chinese
    # school names align correctly; ^10 centers each field in 10 chars.
    # (Also fixes the original comment's typo: "字符宽度", character width.)
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    print(tplt.format("排名","学校名称","总分",chr(12288)))
    # Slice instead of range(num): the original raised IndexError when
    # num exceeded len(ulist); now it simply prints all available rows.
    for row in ulist[:num]:
        print(tplt.format(row[0], row[1], row[2], chr(12288)))
def main():
    """Crawl the 2019 best-universities ranking page and print the top 40."""
    uinfo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 40)  # fixed stale comment: prints 40 univs, not 20


if __name__ == "__main__":
    # Guard so importing these notes as a module does not trigger the crawl.
    main()
format方法的格式:
中文对齐问题的解决:
采用中文字符的空格填充chr(12288)