-
使用方式:
安装 lxml(也可以不安装,改用 Python 自带的 html.parser):pip3 install lxml
from bs4 import BeautifulSoup html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <p class="title"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """ # soup = BeautifulSoup(html_doc, 'html.parser') soup = BeautifulSoup(html_doc, 'lxml') # 速度较快 print(soup.prettify()) #缩进,换行,格式化后输出
-
元素类型:
print(type(soup)) # bs4.BeautifulSoup类型 print(type(soup.a)) # Tag类型. print(type(soup.a.string)) # bs4.element.NavigableString类型
-
查找:
# 获取title tag. print(soup.title) # <title>The Dormouse's story</title> # 从上往下查询首个对应Tag. soup.body soup.a soup.p soup.b # 查询当前Tag中的内容,以列表形式输出 #['\n', <p class="title"><b>The Dormouse's story</b></p>, '\n', <p class="story">Once upon a time there were three little sisters; and their names were #<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, #<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and #<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; #and they lived at the bottom of a well.</p>, '\n', <p class="story">...</p>, '\n'] soup.p.contents soup.p.children # 和contents一致,区别在于,这个只是个迭代器,用于for...in...
查找所有匹配的(其实就是找标签)
# 过滤器 # 1. 字符串 soup.find_all('b') # 按标签名精确匹配,找出所有b标签 # 2. 正则表达式 soup.find_all(re.compile('^b')) # 找出所有以b开头的标签 soup.find_all(re.compile('t')) # 找出所有名字含有t的标签 # 3. 列表 soup.find_all(['a', 'b']) # 找出所有a和b的标签.(注意,非正则) # 4. 属性指定 soup.find_all(id="link2") soup.find_all(attrs={"data-foo": "value"}) # 适用于不能作为关键字参数的属性(如含连字符的data-foo) # 5. 方法 def tag_with_href_and_end_with_lacie(tag): return tag.has_attr('href') and re.match(r"(.*lacie)", tag['href']) for tag in soup.find_all(tag_with_href_and_end_with_lacie): print(tag) # 上面的方法可以优化成下面的形式: def href_no_lacie(href): return href and not re.match(r"(.*lacie$)", href) soup.find_all(href=href_no_lacie) # 6. CSS类 soup.find_all(class_=re.compile('itl')) # class是Python关键字,必须写成class_ soup.find_all('a', class_='sister') # 7. 内容 soup.find_all('a', string="Tillie") #注意,如果直接find_all(string="Tillie"),搜索出来的是NavigableString,不是Tag # 8. 限制数量 soup.find_all(True, limit=2)
BeautifulSoup使用引导,超精简!
最新推荐文章于 2024-09-14 19:55:48 发布