import re

from bs4 import BeautifulSoup, Comment

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# Alternatively, parse an HTML file: soup = BeautifulSoup(open("index.html"))
soup = BeautifulSoup(html_doc, "html.parser")
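# A minimal sketch (not in the original; assumes a local "index.html" exists):
# BeautifulSoup also accepts an open file object, and a with-block closes it cleanly.
# with open("index.html", encoding="utf-8") as fp:
#     soup = BeautifulSoup(fp, "html.parser")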

tag = soup.p
# Get the tag's type
print(type(tag))  # <class 'bs4.element.Tag'>
# Get the tag's name
print(tag.name)  # p
# Get the value of the tag's class attribute
print(tag['class'])  # ['title']
# Get all of the tag's attributes
print(tag.attrs)  # {'class': ['title']}

css_soup = BeautifulSoup('<p class="body strikeout"></p>', "html.parser")
# Get a multi-valued attribute
print(css_soup.p['class'])  # ['body', 'strikeout']
# Get the tag's content
print(tag.string)  # The Dormouse's story
# Get the type of the tag's content; strings are wrapped in NavigableString
print(type(tag.string))  # <class 'bs4.element.NavigableString'>

# Replace the tag's content
tag.string.replace_with("No longer bold")
print(tag)  # <p class="title"><b>No longer bold</b></p>
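# A hedged aside (not in the original): subscripting a missing attribute raises
# KeyError, while tag.get() returns None (or a default), mirroring dict.get().
print(tag.get('id'))  # None -- this <p> has no id attribute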

# A BeautifulSoup object represents the document as a whole
print(soup.name)  # [document]

markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, "html.parser")
comment = soup.b.string
# The comment's type is Comment, a special kind of NavigableString
print(type(comment))  # <class 'bs4.element.Comment'>
# Pretty-print the output:
# <b>
#  <!--Hey, buddy. Want to buy a used parser?-->
# </b>
print(soup.b.prettify())
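# A small sketch (not in the original): one common way to collect every comment
# node is to filter text nodes by their concrete type.
print(soup.find_all(text=lambda s: isinstance(s, Comment)))
# ['Hey, buddy. Want to buy a used parser?']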

# Navigating the tree
soup = BeautifulSoup(html_doc, "html.parser")
# Get the head node directly
print(soup.head)  # <head><title>The Dormouse's story</title></head>
# Get the title node directly
print(soup.title)  # <title>The Dormouse's story</title>
# Get the first b tag inside the body element
print(soup.body.b)  # <b>The Dormouse's story</b>
# Get the first a tag in the document
print(soup.a)  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# Get all a tags in the document
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all('a'))
# .contents returns all children of the head tag as a list
print(soup.head.contents)  # [<title>The Dormouse's story</title>]
# .children iterates over all children of head
# Result: <title>The Dormouse's story</title>
for child in soup.head.children:
    print(child)
# .descendants iterates over all descendants of head
# Result:
# <title>The Dormouse's story</title>
# The Dormouse's story
for child in soup.head.descendants:
    print(child)
# The document itself has a single child: the html node
print(len(list(soup.children)))  # 1
# Count all descendants of the whole document
print(len(list(soup.descendants)))  # 26
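# A hedged aside (not in the original): .descendants yields NavigableString nodes
# as well as tags, which is why 26 is larger than the tag count alone.
print(len(soup.find_all(True)))  # 11 -- tags only, no string nodes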
# Get all strings in the document
# Result:
# "The Dormouse's story"
# '\n'
# '\n'
# "The Dormouse's story"
# '\n'
# 'Once upon a time there were three little sisters; and their names were\n'
# 'Elsie'
# ',\n'
# 'Lacie'
# ' and\n'
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '\n'
# '...'
# '\n'
for string in soup.strings:
    print(repr(string))
# Get all strings in the document with extra whitespace removed
# Result:
# "The Dormouse's story"
# "The Dormouse's story"
# 'Once upon a time there were three little sisters; and their names were'
# 'Elsie'
# ','
# 'Lacie'
# 'and'
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '...'
for string in soup.stripped_strings:
    print(repr(string))
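# A quick sketch (not in the original): stripped_strings is handy for flattening
# a document into plain text; this prints the whole document on one line.
print(" ".join(soup.stripped_strings))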

title_tag = soup.title
# .parent finds the parent node of the title node
print(title_tag.parent)  # <head><title>The Dormouse's story</title></head>
# Get all parents of title_tag
# Result:
# head
# html
# [document]
for parent in title_tag.parents:
    print(parent.name)

sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", "html.parser")
# .next_sibling returns the node's next sibling
print(sibling_soup.b.next_sibling)  # <c>text2</c>
print(sibling_soup.c.next_sibling)  # None
# .previous_sibling returns the node's previous sibling
print(sibling_soup.c.previous_sibling)  # <b>text1</b>
print(sibling_soup.b.previous_sibling)  # None
# .next_siblings iterates over all following siblings of the first a tag
# Result:
# ',\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# ' and\n'
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# ';\nand they lived at the bottom of a well.'
for sibling in soup.a.next_siblings:
    print(repr(sibling))
# .previous_siblings iterates over all preceding siblings of the tag with id link3
# Result:
# ' and\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# ',\n'
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# 'Once upon a time there were three little sisters; and their names were\n'
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))

last_a_tag = soup.find("a", id="link3")
# Find the object parsed immediately after the last a tag;
# unlike next_sibling, this follows parse order, not sibling order
print(last_a_tag.next_element)  # Tillie
# Find the object parsed immediately before the last a tag
print(repr(last_a_tag.previous_element))  # ' and\n'
# Iterate over everything parsed after the last a tag
# Result:
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '\n'
# <p class="story">...</p>
# '...'
# '\n'
for element in last_a_tag.next_elements:
    print(repr(element))

# Find all b tags in the document
print(soup.find_all('b'))  # [<b>The Dormouse's story</b>]
# Find all tags whose names start with b
# Result:
# body
# b
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
# Find all tags whose names contain t
# Result:
# html
# title
for tag in soup.find_all(re.compile("t")):
    print(tag.name)
# Pass a list to match several tag names
# Result: [<b>The Dormouse's story</b>,
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all(['a', 'b']))
# True matches every tag, but returns no string nodes
for tag in soup.find_all(True):
    print(tag.name)


# Define a filter function
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')


# Filter with the custom function
# Result: [<p class="title"><b>The Dormouse's story</b></p>,
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>,
# <p class="story">...</p>]
print(soup.find_all(has_class_but_no_id))
# Find the element with id link2
print(soup.find_all(id='link2'))  # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
# Find elements whose href contains elsie
print(soup.find_all(href=re.compile("elsie")))  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# Find all elements that have an id attribute
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all(id=True))
# Combine several conditions
# Result:
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
print(soup.find_all(href=re.compile("elsie"), id='link1'))

data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', "html.parser")
# data-* attributes can't be passed as keyword arguments, but attrs works
print(data_soup.find_all(attrs={"data-foo": "value"}))  # [<div data-foo="value">foo!</div>]
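# A hedged aside (not in the original): the attribute is still readable by
# subscript even though it cannot be passed as a keyword argument.
print(data_soup.div['data-foo'])  # value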
# Search by CSS class; class is a Python keyword, so class_ is used instead
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all('a', class_='sister'))
# class_ also accepts a regular expression
print(soup.find_all(class_=re.compile("itl")))  # [<p class="title"><b>The Dormouse's story</b></p>]


def has_six_characters(css_class):
    return css_class is not None and len(css_class) == 6


# Filter classes with a custom function
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all(class_=has_six_characters))
# Find the exact string "Elsie" in the document
print(soup.find_all(text="Elsie"))  # ['Elsie']
# Search text with a regular expression
print(soup.find_all(text=re.compile("Dormouse")))  # ["The Dormouse's story", "The Dormouse's story"]
# limit caps the number of results returned
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
print(soup.find_all("a", limit=2))
# By default the search covers all descendants; recursive=False checks direct children only
print(soup.find_all('title', recursive=False))  # []
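# A clarifying sketch (not in the original): <title> is a direct child of <head>,
# so the same non-recursive search rooted there does match.
print(soup.head.find_all('title', recursive=False))  # [<title>The Dormouse's story</title>]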
# Equivalent to soup.find_all("a")
print(soup("a"))
# Equivalent to soup.title.find_all(text=True)
print(soup.title(text=True))
# find() behaves like find_all(), with two differences:
# 1. find() returns only the first matching element; find_all() returns all of them
# 2. if nothing matches, find() returns None, while find_all() returns an empty list
print(soup.find("a"))  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
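# A quick check of the difference described above (the tag name is deliberately bogus):
print(soup.find("nosuchtag"))  # None
print(soup.find_all("nosuchtag"))  # []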

a_string = soup.find(text='Lacie')
# Find all a-tag ancestors of a_string
print(a_string.find_parents("a"))  # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
# Find the first p-tag ancestor of a_string
# Result:
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
print(a_string.find_parent("p"))
# Find all p-tag ancestors of a_string whose class is title
print(a_string.find_parents("p", class_="title"))  # []

first_link = soup.a
# Find all following a-tag siblings of the first a tag
# Result: [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(first_link.find_next_siblings("a"))

first_story_paragraph = soup.find("p", "story")
# Find the next p-tag sibling of first_story_paragraph
print(first_story_paragraph.find_next_sibling("p"))  # <p class="story">...</p>

last_link = soup.find("a", id="link3")
# Find all preceding a-tag siblings of last_link
# Result: [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
print(last_link.find_previous_siblings("a"))
# Find the nearest preceding a-tag sibling of last_link
print(last_link.find_previous_sibling("a"))  # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

first_link = soup.a
# Find all text nodes after first_link
# Result: ['Elsie', ',\n', 'Lacie', ' and\n', 'Tillie', ';\nand they lived at the bottom of a well.', '\n', '...', '\n']
print(first_link.find_all_next(text=True))
# Find the first p tag after first_link
print(first_link.find_next("p"))  # <p class="story">...</p>
# Find all p tags before first_link
# Result: [<p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>, <p class="title"><b>The Dormouse's story</b></p>]
print(first_link.find_all_previous("p"))
# Find the nearest title element before first_link
print(first_link.find_previous("title"))  # <title>The Dormouse's story</title>


# CSS selectors
# Select elements whose tag is title
print(soup.select("title"))  # [<title>The Dormouse's story</title>]
# Select the third p element
print(soup.select("p:nth-of-type(3)"))  # [<p class="story">...</p>]
# Select all a tags anywhere under body
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select("body a"))
# Select the title element under head under html, level by level
print(soup.select("html head title"))  # [<title>The Dormouse's story</title>]
# Select title elements that are direct children of head
print(soup.select("head > title"))  # [<title>The Dormouse's story</title>]
# Select the direct child of a p element whose id is link1
print(soup.select("p > #link1"))  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# Select a elements that are direct children of body (no deep search)
print(soup.select("body > a"))  # []
# Select all siblings of #link1 with class sister
# Result: [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select("#link1 ~ .sister"))
# Select elements by the CSS class sister
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select(".sister"))
# Select elements by id
print(soup.select("#link1"))  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# Select all a tags that have an href attribute
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select("a[href]"))
# Select a tags by exact href value
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
print(soup.select('a[href="http://example.com/elsie"]'))
# Select a tags whose href starts with a given prefix
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select('a[href^="http://example.com"]'))
# Select a tags whose href ends with tillie
# Result: [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select('a[href$="tillie"]'))
# Select a tags whose href contains the given substring
print(soup.select('a[href*=".com/el"]'))
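# A related sketch (not in the original): select_one() returns the first match
# directly (or None) instead of a list.
print(soup.select_one(".sister"))  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>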

# Modifying the tree
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', "html.parser")
tag = soup.b
# Change the tag's name
tag.name = "blockquote"
# Change the tag's class
tag['class'] = "verybold"
# Add an id attribute to the tag
tag['id'] = 1
print(tag)  # <blockquote class="verybold" id="1">Extremely bold</blockquote>
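# A hedged aside (not in the original): attributes can be deleted like dict keys;
# the class is re-added here so the next print is unchanged.
del tag['class']
print(tag)  # <blockquote id="1">Extremely bold</blockquote>
tag['class'] = "verybold"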
# Replace the tag's contents via .string
tag.string = "New link text."
print(tag)  # <blockquote class="verybold" id="1">New link text.</blockquote>

soup = BeautifulSoup("<a>Foo</a>", "html.parser")
# Append content to a tag
soup.a.append("Bar")
print(soup.a)  # <a>FooBar</a>
# Create a string object with new_string()
new_string = soup.new_string("New content")
soup.a.append(new_string)
print(soup.a)  # <a>FooBarNew content</a>
# Create a comment object
new_comment = soup.new_string("I am comment.", Comment)
soup.a.append(new_comment)
print(soup.a)  # <a>FooBarNew content<!--I am comment.--></a>

soup = BeautifulSoup("<b></b>", "html.parser")
original_tag = soup.b
# Create a brand-new tag with new_tag()
new_tag = soup.new_tag("a", href="http://www.example.com")
original_tag.append(new_tag)
print(original_tag)  # <b><a href="http://www.example.com"></a></b>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, "html.parser")
tag = soup.a
# insert() places content at the given index within the tag
tag.insert(1, "but did not endorse")
print(tag)  # <a href="http://example.com/">I linked to but did not endorse<i>example.com</i></a>

soup = BeautifulSoup("<b>stop</b>", "html.parser")
tag = soup.new_tag("i")
tag.string = "Don't"
# insert_before() inserts content directly before a tag or text node
soup.b.string.insert_before(tag)
print(soup)  # <b><i>Don't</i>stop</b>
# insert_after() inserts content directly after a tag or text node
soup.b.i.insert_after(soup.new_string(" no no "))
print(soup)  # <b><i>Don't</i> no no stop</b>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
tag = soup.a
# clear() removes the tag's contents
tag.clear()
print(tag)  # <a href="http://example.com/"></a>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
# extract() removes the tag from the tree and returns it
i_tag = soup.i.extract()
print(a_tag)  # <a href="http://example.com/">I linked to </a>
print(i_tag)  # <i>example.com</i>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
# decompose() removes the tag from the tree and destroys it completely
i_tag = soup.i.decompose()
print(a_tag)  # <a href="http://example.com/">I linked to </a>
print(i_tag)  # None

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
new_tag = soup.new_tag("b")
new_tag.string = "example.net"
# replace_with() removes a piece of the tree and replaces it with a new tag or text node
a_tag.i.replace_with(new_tag)
print(a_tag)  # <a href="http://example.com/">I linked to <b>example.net</b></a>

soup = BeautifulSoup("<p>I wish I was bold.</p>", 'html.parser')
# wrap() wraps the specified element in the given tag
soup.p.string.wrap(soup.new_tag("b"))
print(soup)  # <p><b>I wish I was bold.</b></p>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
# unwrap() is the opposite of wrap(): it removes the tag but keeps its contents,
# which is handy for stripping markup
a_tag.i.unwrap()
print(a_tag)  # <a href="http://example.com/">I linked to example.com</a>

markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
soup = BeautifulSoup(markup, 'html.parser')
# To get only the text inside a tag, call get_text(); it gathers all the text in
# the tag and its descendants and returns it as a single Unicode string:
print(repr(soup.get_text()))  # '\nI linked to example.com\n'
# A separator for the pieces of text can be specified
print(repr(soup.get_text("|")))  # '\nI linked to |example.com|\n'
# Whitespace can also be stripped from each piece of text
print(repr(soup.get_text("|", strip=True)))  # 'I linked to|example.com'
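# A closing sketch (not in the original): get_text(separator, strip=True) behaves
# like a join over .stripped_strings, so this should print the same string as above.
print(repr("|".join(soup.stripped_strings)))  # 'I linked to|example.com'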