Introduction
Beautiful Soup is a Python library for pulling data out of HTML and XML files. Working with your parser of choice, it gives you idiomatic ways to navigate, search, and modify the parse tree, and it can save you hours or even days of work.
Installation
$ pip install beautifulsoup4
Beautiful Soup works with Python's built-in HTML parser and also supports third-party parsers such as lxml (including its XML parser) and html5lib, which must be installed separately:
$ pip install lxml
$ pip install html5lib
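As a minimal sketch (the snippet and variable names below are just an illustration): the second argument to the BeautifulSoup constructor selects the parser; 'html.parser' ships with Python, while 'lxml' and 'html5lib' require the packages installed above.
from bs4 import BeautifulSoup

doc = "<p>Hello, <b>world</b></p>"
# Built-in parser -- no extra dependency required
soup_builtin = BeautifulSoup(doc, "html.parser")
# Third-party parsers -- require the pip installs above
soup_lxml = BeautifulSoup(doc, "lxml")
soup_html5 = BeautifulSoup(doc, "html5lib")
print(soup_builtin.b.get_text())  # world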
Usage
Example 1
__author__ = 'MrChen'
from bs4 import BeautifulSoup
# Sample document to parse
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Instantiate a BeautifulSoup object. The first argument can be a string or an open file handle such as open('mydoc.html'); the second names the parser to use.
soup = BeautifulSoup(html_doc, 'html.parser')
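# A small sketch: the same call also accepts an open file handle instead of a string,
# assuming a local file named mydoc.html exists (hypothetical file):
# with open('mydoc.html') as fp:
#     soup_from_file = BeautifulSoup(fp, 'html.parser')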
print(soup.title)
# Output: <title>The Dormouse's story</title>
print(soup.title.parent)
# Output: <head><title>The Dormouse's story</title></head>
print(soup.title.parent.parent)
# Output:
#<html><head><title>The Dormouse's story</title></head>
#<body>
#<p class="title"><b>The Dormouse's story</b></p>
#<p class="story">Once upon a time there were three little sisters; and their names were
#<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
#<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
#and they lived at the bottom of a well.</p>
#<p class="story">...</p>
#</body></html>
print(soup.title.name)
# Output: title
print(soup.title.parent.name)
# Output: head
print(soup.title.parent.parent.name)
# Output: html
print(soup.p)
# Output: <p class="title"><b>The Dormouse's story</b></p>
print(soup.p['class'])
# Output: ['title']
print(soup.a)
# Output: <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(soup.find_all('a'))
# Output:
#[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find(id='link3'))
# Output: <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
for link in soup.find_all('a'):
    print(link.get('href'))
# Output:
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie
print(soup.getText())
# Output:
# The Dormouse's story
#
# The Dormouse's story
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
# ...
print('all tags : <<<<<<')
for tag in soup.find_all(True):
    print(tag.name)
# Output:
#html
#head
#title
#body
#p
#b
#p
#a
#a
#a
#p
# findAll() is the older (pre-4.x) alias of find_all(); both accept a class filter
# either via the class_ keyword or an attribute dict. There is no <ul class="sub-menu">
# in this document, so these lines are kept only as syntax examples:
#print(soup.findAll('ul', class_="sub-menu"))
#print(soup.findAll('ul',{"class":"sub-menu"}))
#print(soup.find_all('ul', class_="sub-menu"))
#print(soup.find_all('ul',{"class":"sub-menu"}))
Example 2
from bs4 import BeautifulSoup
import requests
html_doc = """
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=Edge">
<title>首页 - 简书</title>
</head>
<body class="output fluid zh cn win reader-day-mode reader-font2 " data-js-module="recommendation" data-locale="zh-CN">
<ul class="article-list thumbnails">
<li class=have-img>
<a class="wrap-img" href="/p/49c4728c3ab2"><img src="http://upload-images.jianshu.io/upload_images/2442470-745c6471c6f8258c.jpg?imageMogr2/auto-orient/strip%7CimageView2/1/w/300/h/300" alt="300" /></a>
<div>
<p class="list-top">
<a class="author-name blue-link" target="_blank" href="/users/0af6b163b687">阿随向前冲</a>
<em>·</em>
<span class="time" data-shared-at="2016-07-27T07:03:54+08:00"></span>
</p>
<h4 class="title"><a target="_blank" href="/p/49c4728c3ab2"> 只装了这六款软件,工作就高效到有时间逛某宝刷某圈</a></h4>
<div class="list-footer">
<a target="_blank" href="/p/49c4728c3ab2">
阅读 1830
</a> <a target="_blank" href="/p/49c4728c3ab2#comments">
· 评论 35
</a> <span> · 喜欢 95</span>
<span> · 打赏 1</span>
</div>
</div>
</li>
</ul>
</body>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
# Find every <li> node that carries the "have-img" class
tags = soup.find_all('li', class_="have-img")
for tag in tags:
    image = tag.img['src']
    article_user = tag.p.a.get_text()
    article_user_url = tag.p.a['href']
    created = tag.p.span['data-shared-at']
    article_url = tag.h4.a['href']
    # find_all() can be called again on a tag that has already been found
    tag_span = tag.div.div.find_all('span')
    likes = tag_span[0].get_text(strip=True)
    print(article_user, article_user_url, article_url, created, image, likes)
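Example 2 parses a saved HTML snippet; in practice you would first download the page. Here is a hedged sketch of that step with requests (the URL below is only a placeholder, not a page this article actually scrapes):
import requests
from bs4 import BeautifulSoup

# Placeholder URL for illustration; replace it with the page you actually want to scrape
resp = requests.get('https://example.com')
resp.encoding = resp.apparent_encoding  # let requests guess the page encoding
soup = BeautifulSoup(resp.text, 'html.parser')
print(soup.title)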
For a deeper dive, see the official documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
Other articles for reference:
https://www.jianshu.com/p/2b783f7914c6
https://blog.csdn.net/bruce_6/article/details/80764000