网页抓取
from bs4 import BeautifulSoup
import requests as req

# Fetch the page. The timeout keeps the script from blocking indefinitely
# if the server never answers.
resp = req.get("http://www.baidu.com", timeout=10)

# The response is an HTML page, so use the lxml HTML parser ("lxml-xml" is
# the XML parser). Passing the raw bytes lets Beautiful Soup detect the
# document encoding itself instead of relying on the HTTP header guess.
soup = BeautifulSoup(resp.content, "lxml")
美化代码
使用 prettify() 方法
from bs4 import BeautifulSoup
import requests as req

# Fetch the page. The timeout keeps the script from blocking indefinitely
# if the server never answers.
resp = req.get("http://www.baidu.com", timeout=10)

# The response is an HTML page, so use the lxml HTML parser ("lxml-xml" is
# the XML parser). Passing the raw bytes lets Beautiful Soup detect the
# document encoding itself instead of relying on the HTTP header guess.
soup = BeautifulSoup(resp.content, "lxml")

# prettify() returns the parse tree as an indented, human-readable string.
print(soup.prettify())
通过 ID 查找元素
使用 find() 方法
from bs4 import BeautifulSoup

# Explicit encoding so the result does not depend on the platform default;
# assumes index.html is UTF-8 encoded -- adjust if it is not.
with open("index.html", "r", encoding="utf-8") as f:
    contents = f.read()

soup = BeautifulSoup(contents, 'lxml')

# Look up the <ul> element whose id attribute is "mylist".
# The same search can be written as: soup.find("ul", attrs={"id": "mylist"})
print(soup.find("ul", id="mylist"))
查找 ID 为 mylist 的 ul 标签
通过各种方式(包括元素 ID)查找元素
查找所有标签
使用 find_all() 方法
from bs4 import BeautifulSoup

# Explicit encoding so the result does not depend on the platform default;
# assumes index.html is UTF-8 encoded -- adjust if it is not.
with open("index.html", "r", encoding="utf-8") as f:
    contents = f.read()

soup = BeautifulSoup(contents, 'lxml')

# Print the tag name and text content of every <li> element.
for tag in soup.find_all("li"):
    print(f"{tag.name}: {tag.text}")
find_all() 方法可以接受一个要搜索的元素名称列表
# Passing a list makes find_all() match any of the listed tag names.
# `soup` is the BeautifulSoup object built from index.html.
tags = soup.find_all(['h2', 'p'])

for tag in tags:
    # Collapse runs of whitespace in the element's text into single spaces.
    print(" ".join(tag.text.split()))
find_all() 方法还可以接受一个函数,由该函数决定应返回哪些元素
#!/usr/bin/python3
from bs4 import BeautifulSoup


def myfun(tag):
    """Return the tag's ``is_empty_element`` flag.

    Used as a filter for ``find_all()``: a tag is kept when this returns a
    truthy value.
    """
    return tag.is_empty_element


# Explicit encoding so the result does not depend on the platform default;
# assumes index.html is UTF-8 encoded -- adjust if it is not.
with open("index.html", "r", encoding="utf-8") as f:
    contents = f.read()

soup = BeautifulSoup(contents, 'lxml')

# find_all() calls myfun on each tag and returns those it accepts.
tags = soup.find_all(myfun)
print(tags)
使用正则表达式查找元素
import re

from bs4 import BeautifulSoup

# Explicit encoding so the result does not depend on the platform default;
# assumes index.html is UTF-8 encoded -- adjust if it is not.
with open("index.html", "r", encoding="utf-8") as f:
    contents = f.read()

soup = BeautifulSoup(contents, 'lxml')

# Find every text node whose content matches the pattern "BSD".
strings = soup.find_all(string=re.compile('BSD'))

for txt in strings:
    # Collapse runs of whitespace in the matched text into single spaces.
    print(" ".join(txt.split()))
CSS 选择器
通过 select() 和 select_one() 方法,可以使用 CSS 选择器来查找元素
from bs4 import BeautifulSoup

# Explicit encoding so the result does not depend on the platform default;
# assumes index.html is UTF-8 encoded -- adjust if it is not.
with open("index.html", "r", encoding="utf-8") as f:
    contents = f.read()

soup = BeautifulSoup(contents, 'lxml')

# select() returns a list of all elements matching the CSS selector; here,
# each <li> that is the third <li> among its siblings.
print(soup.select("li:nth-of-type(3)"))
CSS 中使用# 字符通过 ID 属性选择标签
from bs4 import BeautifulSoup

# Explicit encoding so the result does not depend on the platform default;
# assumes index.html is UTF-8 encoded -- adjust if it is not.
with open("index.html", "r", encoding="utf-8") as f:
    contents = f.read()

soup = BeautifulSoup(contents, 'lxml')

# select_one() returns only the first element matching the CSS selector;
# "#mylist" selects by id attribute.
print(soup.select_one("#mylist"))
追加元素
append() 方法将新标签附加到 HTML 文档
from bs4 import BeautifulSoup

# Explicit encoding so the result does not depend on the platform default;
# assumes index.html is UTF-8 encoded -- adjust if it is not.
with open("index.html", "r", encoding="utf-8") as f:
    contents = f.read()

soup = BeautifulSoup(contents, 'lxml')

# Create a new <li> tag with new_tag() and give it its text content.
newtag = soup.new_tag('li')
newtag.string = 'OpenBSD'

# Get a reference to the first <ul> tag in the document.
ultag = soup.ul

# Append the newly created tag as the last child of the <ul> tag.
ultag.append(newtag)

print(ultag.prettify())
使用 new_tag() 方法创建一个新标签
插入元素
insert() 方法在指定位置插入标签
from bs4 import BeautifulSoup

# Explicit encoding so the result does not depend on the platform default;
# assumes index.html is UTF-8 encoded -- adjust if it is not.
with open("index.html", "r", encoding="utf-8") as f:
    contents = f.read()

soup = BeautifulSoup(contents, 'lxml')

# Create a new <li> tag and give it its text content.
newtag = soup.new_tag('li')
newtag.string = 'OpenBSD'

# Get a reference to the first <ul> tag in the document.
ultag = soup.ul

# Insert the new tag at index 2 of the <ul> tag's children.
ultag.insert(2, newtag)

print(ultag.prettify())
替换文字
replace_with() 方法替换元素的文本
from bs4 import BeautifulSoup

# Explicit encoding so the result does not depend on the platform default;
# assumes index.html is UTF-8 encoded -- adjust if it is not.
with open("index.html", "r", encoding="utf-8") as f:
    contents = f.read()

soup = BeautifulSoup(contents, 'lxml')

# Locate the text node that is exactly "Windows". `string=` is the current
# name of this argument; `text=` is its deprecated alias.
tag = soup.find(string="Windows")

# find() returns None when nothing matches, so only replace on a hit.
if tag is not None:
    tag.replace_with("OpenBSD")

print(soup.ul.prettify())
删除元素
decompose() 方法从树中删除标签并销毁它
from bs4 import BeautifulSoup

# Explicit encoding so the result does not depend on the platform default;
# assumes index.html is UTF-8 encoded -- adjust if it is not.
with open("index.html", "r", encoding="utf-8") as f:
    contents = f.read()

soup = BeautifulSoup(contents, 'lxml')

# Select the first <p> that is the second <p> among its siblings.
ptag2 = soup.select_one("p:nth-of-type(2)")

# select_one() returns None when nothing matches, so only remove on a hit.
if ptag2 is not None:
    # decompose() removes the tag from the tree and destroys it.
    ptag2.decompose()

print(soup.body.prettify())