Python is beautiful: [Python Beginner Crawler Series 01] Detailed usage of the beautifulsoup4 HTML parser

import re

from bs4 import BeautifulSoup, Comment

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Alternatively, parse an HTML file: soup = BeautifulSoup(open("index.html"))
soup = BeautifulSoup(html_doc, "html.parser")
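# Aside: html.parser is Python's built-in parser. BeautifulSoup can also delegate
# to third-party parsers, which are typically faster and more lenient. A minimal
# sketch, assuming the lxml package is installed separately (pip install lxml):
fast_soup = BeautifulSoup(html_doc, "lxml")
print(fast_soup.title.string)  # The Dormouse's story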

tag = soup.p
# The type of a tag
print(type(tag))  # <class 'bs4.element.Tag'>
# The tag's name
print(tag.name)  # p
# The value of the tag's class attribute
print(tag['class'])  # ['title']
# All attributes of the tag
print(tag.attrs)  # {'class': ['title']}

css_soup = BeautifulSoup('<p class="body strikeout"></p>', "html.parser")
# Multi-valued attributes are returned as a list
print(css_soup.p['class'])  # ['body', 'strikeout']
# The tag's text content
print(tag.string)  # The Dormouse's story
# Strings inside a tag are wrapped in NavigableString
print(type(tag.string))  # <class 'bs4.element.NavigableString'>

# Replace a tag's content
tag.string.replace_with("No longer bold")
print(tag)  # <p class="title"><b>No longer bold</b></p>

# The BeautifulSoup object represents the document as a whole
print(soup.name)  # [document]

markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, "html.parser")
comment = soup.b.string
# Comments get the special type Comment, a subclass of NavigableString
print(type(comment))  # <class 'bs4.element.Comment'>
# Pretty-print the output:
# <b>
#  <!--Hey, buddy. Want to buy a used parser?-->
# </b>
print(soup.b.prettify())
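# Aside: since Comment is just a NavigableString subclass, an isinstance check
# finds every comment in a document, e.g. to strip them before further
# processing. A small sketch using the soup parsed above:
for c in soup.find_all(text=lambda s: isinstance(s, Comment)):
    c.extract()
print(soup)  # <b></b>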

# Navigating the tree
soup = BeautifulSoup(html_doc, "html.parser")
# Get the head tag directly
print(soup.head)  # <head><title>The Dormouse's story</title></head>
# Get the title tag directly
print(soup.title)  # <title>The Dormouse's story</title>
# The first <b> tag inside body
print(soup.body.b)  # <b>The Dormouse's story</b>
# The first <a> tag in the document
print(soup.a)  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# All <a> tags in the document
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all('a'))
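# Aside: a common next step is reading attributes off the tags that find_all()
# returns; .get() returns None instead of raising KeyError when an attribute
# is missing. A minimal sketch:
for link in soup.find_all('a'):
    print(link.get('href'))
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie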

# .contents returns all direct children of head as a list
print(soup.head.contents)  # [<title>The Dormouse's story</title>]

# .children iterates over the direct children of head
# Result: <title>The Dormouse's story</title>
for child in soup.head.children:
    print(child)
# .descendants iterates over all descendants of head
# Result:
# <title>The Dormouse's story</title>
# The Dormouse's story
for child in soup.head.descendants:
    print(child)
# The document object itself has a single direct child, the <html> tag
print(len(list(soup.children)))  # 1
# The total number of descendants of the document
print(len(list(soup.descendants)))  # 26

# All strings in the document
# Result:
# "The Dormouse's story"
# '\n'
# '\n'
# "The Dormouse's story"
# '\n'
# 'Once upon a time there were three little sisters; and their names were\n'
# 'Elsie'
# ',\n'
# 'Lacie'
# ' and\n'
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '\n'
# '...'
# '\n'
for string in soup.strings:
    print(repr(string))
# All strings in the document, with extra whitespace stripped
# Result:
# "The Dormouse's story"
# "The Dormouse's story"
# 'Once upon a time there were three little sisters; and their names were'
# 'Elsie'
# ','
# 'Lacie'
# 'and'
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '...'
for string in soup.stripped_strings:
    print(repr(string))
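# Aside: joined back together, stripped_strings makes a quick plain-text
# extractor. A minimal sketch:
text = " ".join(soup.stripped_strings)
print(text)  # The Dormouse's story The Dormouse's story Once upon a time ...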

title_tag = soup.title
# .parent gives the title tag's parent node
print(title_tag.parent)  # <head><title>The Dormouse's story</title></head>
# All parents of title_tag
# Result:
# head
# html
# [document]
for parent in title_tag.parents:
    print(parent.name)

sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", "html.parser")
# .next_sibling gives the next sibling node
print(sibling_soup.b.next_sibling)  # <c>text2</c>
print(sibling_soup.c.next_sibling)  # None
# .previous_sibling gives the previous sibling node
print(sibling_soup.c.previous_sibling)  # <b>text1</b>
print(sibling_soup.b.previous_sibling)  # None

# .next_siblings iterates over all following siblings of the first <a> tag
# Result:
# ',\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# ' and\n'
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# ';\nand they lived at the bottom of a well.'
for sibling in soup.a.next_siblings:
    print(repr(sibling))
# .previous_siblings iterates over all preceding siblings of the tag with id "link3"
# Result:
# ' and\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# ',\n'
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# 'Once upon a time there were three little sisters; and their names were\n'
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))

last_a_tag = soup.find("a", id="link3")
# .next_element is the object parsed immediately after this tag;
# unlike .next_sibling, it follows parse order rather than tree position
print(last_a_tag.next_element)  # Tillie
# .previous_element is the object parsed immediately before this tag
print(repr(last_a_tag.previous_element))  # ' and\n'
# All objects parsed after the last <a> tag
# Result:
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '\n'
# <p class="story">...</p>
# '...'
# '\n'
for element in last_a_tag.next_elements:
    print(repr(element))
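# Aside: to make the parse-order distinction concrete, compare the two on a
# tag that has children. A minimal sketch:
demo = BeautifulSoup("<p><a><b>bold</b></a>tail</p>", "html.parser")
print(demo.a.next_sibling)  # tail          (skips over the <a> tag's children)
print(demo.a.next_element)  # <b>bold</b>   (descends into them first)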

# Find all <b> tags in the document
print(soup.find_all('b'))  # [<b>The Dormouse's story</b>]
# Find all tags whose names start with "b"
# Result:
# body
# b
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
# Find all tags whose names contain "t"
# Result:
# html
# title
for tag in soup.find_all(re.compile("t")):
    print(tag.name)
# Pass a list to match any of several tag names
# Result: [<b>The Dormouse's story</b>,
#  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all(['a', 'b']))
# True matches every tag, but no string nodes are returned
for tag in soup.find_all(True):
    print(tag.name)

# Define a custom filter function
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')


# Filter elements with the custom function
# Result: [<p class="title"><b>The Dormouse's story</b></p>,
#  <p class="story">Once upon a time there were three little sisters; and their names were
#  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
#  and they lived at the bottom of a well.</p>,
#  <p class="story">...</p>]
print(soup.find_all(has_class_but_no_id))

# Find the element with id "link2"
print(soup.find_all(id='link2'))  # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
# Find elements whose href matches "elsie"
print(soup.find_all(href=re.compile("elsie")))  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# Find all elements that carry an id attribute
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all(id=True))
# Combine several conditions in one search
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
print(soup.find_all(href=re.compile("elsie"), id='link1'))

data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')
# Attributes like data-* can't be used as keyword arguments; search them via attrs
print(data_soup.find_all(attrs={"data-foo": "value"}))  # [<div data-foo="value">foo!</div>]
# Search by CSS class; class is a Python keyword, so use class_ instead
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all('a', class_='sister'))
# class_ also accepts a regular expression
print(soup.find_all(class_=re.compile("itl")))  # [<p class="title"><b>The Dormouse's story</b></p>]

def has_six_characters(css_class):
    return css_class is not None and len(css_class) == 6


# Filter elements with a custom function applied to the class value
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all(class_=has_six_characters))

# Find the exact string "Elsie" in the document
print(soup.find_all(text="Elsie"))  # ['Elsie']
# Search text with a regular expression
print(soup.find_all(text=re.compile("Dormouse")))  # ["The Dormouse's story", "The Dormouse's story"]
# limit caps the number of results returned
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
print(soup.find_all("a", limit=2))
# By default the whole subtree is searched; recursive=False checks direct children only
print(soup.find_all('title', recursive=False))  # []
# Calling the soup directly is equivalent to soup.find_all("a")
print(soup("a"))
# Equivalent to soup.title.find_all(text=True)
print(soup.title(text=True))
# find() works like find_all(), with two differences:
# 1. find() returns only the first match; find_all() returns all of them
# 2. with no match, find() returns None, while find_all() returns an empty list
print(soup.find("a"))  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
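# Aside: when only part of a page matters, SoupStrainer restricts what gets
# parsed in the first place via the parse_only argument (supported by
# html.parser and lxml, though not by html5lib). A minimal sketch:
from bs4 import SoupStrainer

only_a_tags = SoupStrainer("a")
link_soup = BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags)
print(link_soup.prettify())  # only the three <a> tags survive parsing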

a_string = soup.find(text='Lacie')
# find_parents() finds all parents of a_string that are <a> tags
print(a_string.find_parents("a"))  # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
# find_parent() finds the first parent of a_string that is a <p> tag
# Result: <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
print(a_string.find_parent("p"))
# Find all <p> parents whose class is "title"
print(a_string.find_parents("p", class_="title"))  # []

first_link = soup.a
# All following siblings of the first <a> tag that are themselves <a> tags
# Result: [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(first_link.find_next_siblings("a"))

first_story_paragraph = soup.find("p", "story")
# The next sibling of first_story_paragraph that is a <p> tag
print(first_story_paragraph.find_next_sibling("p"))  # <p class="story">...</p>

last_link = soup.find("a", id="link3")
# All preceding siblings of last_link that are <a> tags
# Result: [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
print(last_link.find_previous_siblings("a"))
# The nearest preceding sibling of last_link that is an <a> tag
print(last_link.find_previous_sibling("a"))  # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

first_link = soup.a
# All string nodes that come after first_link
# Result: ['Elsie', ',\n', 'Lacie', ' and\n', 'Tillie', ';\nand they lived at the bottom of a well.', '\n', '...', '\n']
print(first_link.find_all_next(text=True))
# The first <p> tag after first_link
print(first_link.find_next("p"))  # <p class="story">...</p>
# All <p> tags before first_link
# Result: [<p class="story">Once upon a time there were three little sisters; ...</p>,
#  <p class="title"><b>The Dormouse's story</b></p>]
print(first_link.find_all_previous("p"))
# The nearest <title> element before first_link
print(first_link.find_previous("title"))  # <title>The Dormouse's story</title>

# CSS selectors
# Select elements with the tag name title
print(soup.select("title"))  # [<title>The Dormouse's story</title>]
# Select the third <p> among its siblings
print(soup.select("p:nth-of-type(3)"))  # [<p class="story">...</p>]
# Select all <a> tags at any depth under body
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select("body a"))
# Select title under head under html, level by level
print(soup.select("html head title"))  # [<title>The Dormouse's story</title>]
# Select <title> elements that are direct children of <head>
print(soup.select("head > title"))  # [<title>The Dormouse's story</title>]
# Select the element with id "link1" that is a direct child of a <p>
print(soup.select("p > #link1"))  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# Direct <a> children of body only; this does not search deeper levels
print(soup.select("body > a"))  # []
# All siblings of #link1 that have the class "sister"
# Result: [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select("#link1 ~ .sister"))
# Select by the CSS class "sister"
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select(".sister"))
# Select by id
print(soup.select("#link1"))  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# All <a> tags that have an href attribute
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select("a[href]"))
# Match an exact href value
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
print(soup.select('a[href="http://example.com/elsie"]'))
# Match an href prefix
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select('a[href^="http://example.com"]'))
# All <a> tags whose href ends with "tillie"
# Result: [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select('a[href$="tillie"]'))
# All <a> tags whose href contains the substring ".com/el"
print(soup.select('a[href*=".com/el"]'))  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
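# Aside: when a single match is enough, newer bs4 releases (roughly 4.4+) also
# provide select_one(), which returns the first match or None, mirroring find():
print(soup.select_one("#link1"))  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>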

# Modifying the tree
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', "html.parser")
tag = soup.b
# Rename the tag
tag.name = "blockquote"
# Change its class
tag['class'] = "verybold"
# Add an id attribute
tag['id'] = 1
print(tag)  # <blockquote class="verybold" id="1">Extremely bold</blockquote>
# Replace the tag's content through .string
tag.string = "New link text."
print(tag)  # <blockquote class="verybold" id="1">New link text.</blockquote>
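# Aside: attributes can be removed again with del, just like dictionary keys:
del tag['class']
del tag['id']
print(tag)  # <blockquote>New link text.</blockquote>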

soup = BeautifulSoup("<a>Foo</a>", "html.parser")
# Append content to a tag
soup.a.append("Bar")
print(soup.a)  # <a>FooBar</a>
# Create a string object with new_string()
new_string = soup.new_string("New content")
soup.a.append(new_string)
print(soup.a)  # <a>FooBarNew content</a>
# Create a comment object by passing the Comment class to new_string()
new_comment = soup.new_string("I am comment.", Comment)
soup.a.append(new_comment)
print(soup.a)  # <a>FooBarNew content<!--I am comment.--></a>

soup = BeautifulSoup("<b></b>", "html.parser")
original_tag = soup.b
# Create a brand-new tag with new_tag()
new_tag = soup.new_tag("a", href="http://www.example.com")
original_tag.append(new_tag)
print(original_tag)  # <b><a href="http://www.example.com"></a></b>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, "html.parser")
tag = soup.a
# insert() places the new content at the given index among the children
tag.insert(1, "but did not endorse")
print(tag)  # <a href="http://example.com/">I linked to but did not endorse<i>example.com</i></a>

soup = BeautifulSoup("<b>stop</b>", "html.parser")
tag = soup.new_tag("i")
tag.string = "Don't"
# insert_before() inserts content before the current tag or string
soup.b.string.insert_before(tag)
print(soup)  # <b><i>Don't</i>stop</b>
# insert_after() inserts content after the current tag or string
soup.b.i.insert_after(soup.new_string(" no no "))
print(soup)  # <b><i>Don't</i> no no stop</b>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
tag = soup.a
# clear() removes the tag's contents
tag.clear()
print(tag)  # <a href="http://example.com/"></a>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
# extract() removes the tag from the tree and returns it
i_tag = soup.i.extract()
print(a_tag)  # <a href="http://example.com/">I linked to </a>
print(i_tag)  # <i>example.com</i>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
# decompose() removes the tag from the tree and destroys it completely
i_tag = soup.i.decompose()
print(a_tag)  # <a href="http://example.com/">I linked to </a>
print(i_tag)  # None
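# Aside: decompose() is handy for scrubbing whole categories of tags before
# extracting text, e.g. scripts and styles. A minimal sketch:
page = BeautifulSoup("<p>hi</p><script>alert(1)</script>", "html.parser")
for junk in page(["script", "style"]):
    junk.decompose()
print(page)  # <p>hi</p>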

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
new_tag = soup.new_tag("b")
new_tag.string = "example.net"
# replace_with() removes a node from the tree and puts a new tag or string in its place
a_tag.i.replace_with(new_tag)
print(a_tag)  # <a href="http://example.com/">I linked to <b>example.net</b></a>

soup = BeautifulSoup("<p>I wish I was bold.</p>", 'html.parser')
# wrap() wraps an element in the given tag
soup.p.string.wrap(soup.new_tag("b"))
print(soup)  # <p><b>I wish I was bold.</b></p>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
# unwrap() is the opposite of wrap(): it removes the tag but keeps its contents,
# which makes it useful for stripping markup
a_tag.i.unwrap()
print(a_tag)  # <a href="http://example.com/">I linked to example.com</a>

markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
soup = BeautifulSoup(markup, 'html.parser')
# get_text() returns all the text inside a tag, including text in its
# descendants, as a single Unicode string
print(repr(soup.get_text()))  # '\nI linked to example.com\n'
# An optional separator joins the text fragments
print(repr(soup.get_text("|")))  # '\nI linked to |example.com|\n'
# strip=True removes leading and trailing whitespace from each fragment
print(repr(soup.get_text("|", strip=True)))  # 'I linked to|example.com'
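# Aside: to close, a minimal end-to-end sketch of how this fits into a crawler,
# assuming network access and using only the standard-library urllib:
from urllib.request import urlopen

html = urlopen("http://example.com").read()
page = BeautifulSoup(html, "html.parser")
print(page.title.string)
for link in page.find_all("a", href=True):
    print(link["href"])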
