1.简介。
BeautifulSoup 和 pyquery 这两个库都是强大的网页解析库,处理高效,支持多种解析器。利用它们不用编写正则表达式即可方便地实现网页信息的提取。
2.代码。
BeautifulSoup库的标签选择器
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# BeautifulSoup tag selectors: tags are reached as attributes of the soup.
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>【i春秋】-专注网络安全_信息安全_白帽子的在线学习_教育_培训平台</title>
<meta name="description" content="i春秋(www.ichunqiu.com)专注网络安全、信息安全、白帽子技术的在线学习,教育、培训的平台。CISP持续教育培训平台,致力于为网络安全、信息安全、白帽子技术爱好者提供便捷,优质的视频教程,学习社区,在线实验评测、SRC部落等在线学习产品和服务,涵盖Web安全、漏洞分析、Android安全、iOS安全、企业安全等权威专业视频教程"</meta>
<meta name="keywords" content="i春秋,网络安全,信息安全,在线学习,在线教育,在线培训">
<meta name="viewport" content="width=device-width,initial-scale=0.29,maximum-scale=1">
"""

soup = BeautifulSoup(html, 'lxml')
title_tag = soup.title
first_meta = soup.meta  # attribute access returns the first <meta> tag

# Pretty-printed document; the parser supplies the missing closing tags.
print(soup.prettify())
print(title_tag.string)  # text inside <title>
print(title_tag)  # the whole <title> tag
print(title_tag.name)  # the tag's name
# Two equivalent ways to read an attribute value from a tag.
print(first_meta.attrs['name'])
print(first_meta['name'])
运行结果:
D:\Anaconda3\python.exe C:/Users/lenovo/PycharmProjects/爬虫/z16.py
<html>
<head>
<title>
【i春秋】-专注网络安全_信息安全_白帽子的在线学习_教育_培训平台
</title>
<meta content="i春秋(www.ichunqiu.com)专注网络安全、信息安全、白帽子技术的在线学习,教育、培训的平台。CISP持续教育培训平台,致力于为网络安全、信息安全、白帽子技术爱好者提供便捷,优质的视频教程,学习社区,在线实验评测、SRC部落等在线学习产品和服务,涵盖Web安全、漏洞分析、Android安全、iOS安全、企业安全等权威专业视频教程" name="description"/>
<meta content="i春秋,网络安全,信息安全,在线学习,在线教育,在线培训" name="keywords"/>
<meta content="width=device-width,initial-scale=0.29,maximum-scale=1" name="viewport"/>
</head>
</html>
【i春秋】-专注网络安全_信息安全_白帽子的在线学习_教育_培训平台
<title>【i春秋】-专注网络安全_信息安全_白帽子的在线学习_教育_培训平台</title>
title
description
description
Process finished with exit code 0
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# BeautifulSoup tree navigation: children, descendants, parent, find/select.
from bs4 import BeautifulSoup

html = '''
<div class="concenter">
<!-- 快速链接 -->
<div class="fastlink">
<div class="tabli">
<h3 class="clearfix">
<button class="active">快速链接</button>
</h3>
<ul>
'''

soup = BeautifulSoup(html, 'lxml')

# .contents is a list of the DIRECT children only (text nodes and the HTML
# comment included); nested tags appear inside their parent, not separately.
print(soup.div.contents)

# .children exposes the same direct children as an iterator.
print(soup.div.children)
for i, child in enumerate(soup.div.children):
    print(i, child)

# .descendants is a generator that walks every nested node.
print(soup.div.descendants)
for i, child in enumerate(soup.div.descendants):
    print(i, child)

print(soup.h3.parent)  # the tag that encloses the first <h3>
print(soup.find('div'))  # first match only
print(soup.find_all('div'))  # list of every match

# CSS selector lookup; print the text of each matching <button>.
for button in soup.select('button'):
    print(button.get_text())
D:\Anaconda3\python.exe C:/Users/lenovo/PycharmProjects/爬虫/z17.py
['\n', ' 快速链接 ', '\n', <div class="fastlink">
<div class="tabli">
<h3 class="clearfix">
<button class="active">快速链接</button>
</h3>
<ul>
</ul></div></div>]
<list_iterator object at 0x000001969242BE80>
0
1 快速链接
2
3 <div class="fastlink">
<div class="tabli">
<h3 class="clearfix">
<button class="active">快速链接</button>
</h3>
<ul>
</ul></div></div>
<generator object Tag.descendants at 0x000001969240BD68>
0
1 快速链接
2
3 <div class="fastlink">
<div class="tabli">
<h3 class="clearfix">
<button class="active">快速链接</button>
</h3>
<ul>
</ul></div></div>
4
5 <div class="tabli">
<h3 class="clearfix">
<button class="active">快速链接</button>
</h3>
<ul>
</ul></div>
6
7 <h3 class="clearfix">
<button class="active">快速链接</button>
</h3>
8
9 <button class="active">快速链接</button>
10 快速链接
11
12
13 <ul>
</ul>
14
<div class="tabli">
<h3 class="clearfix">
<button class="active">快速链接</button>
</h3>
<ul>
</ul></div>
<div class="concenter">
<!-- 快速链接 -->
<div class="fastlink">
<div class="tabli">
<h3 class="clearfix">
<button class="active">快速链接</button>
</h3>
<ul>
</ul></div></div></div>
[<div class="concenter">
<!-- 快速链接 -->
<div class="fastlink">
<div class="tabli">
<h3 class="clearfix">
<button class="active">快速链接</button>
</h3>
<ul>
</ul></div></div></div>, <div class="fastlink">
<div class="tabli">
<h3 class="clearfix">
<button class="active">快速链接</button>
</h3>
<ul>
</ul></div></div>, <div class="tabli">
<h3 class="clearfix">
<button class="active">快速链接</button>
</h3>
<ul>
</ul></div>]
快速链接
Process finished with exit code 0
pyquery库
初始化
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Initialise PyQuery from an HTML string.
from pyquery import PyQuery as pq

html = '''
<html>
<head>
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta http-equiv="content-type" content="text/html;charset=gbk" />
<meta property="wb:webmaster" content="3aababe5ed22e23c" />
<meta name="referrer" content="always" />
<title>百度知道 - 全球最大中文互动问答平台</title>
<link rel="shortcut icon" href="//www.baidu.com/favicon.ico?t=20171027" type="image/x-icon" />
<link rel="icon" sizes="any" mask href="//www.baidu.com/img/baidu.svg" />
'''

doc = pq(html)
meta_tags = doc('meta')  # CSS tag selector: every <meta> element
print(meta_tags)
D:\Anaconda3\python.exe C:/Users/lenovo/PycharmProjects/爬虫/z18.py
<meta http-equiv="X-UA-Compatible" content="IE=Edge"/>
<meta http-equiv="content-type" content="text/html;charset=gbk"/>
<meta property="wb:webmaster" content="3aababe5ed22e23c"/>
<meta name="referrer" content="always"/>
Process finished with exit code 0
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Initialise PyQuery from a URL.
from pyquery import PyQuery as pq

# Decode the response as UTF-8 explicitly. The page declares charset=utf-8 in
# its <meta> tag, but without an explicit encoding the Chinese <title> was
# printed as mojibake ("ç™¾åº¦..."), i.e. the bytes were decoded with the
# wrong charset (presumably the HTTP client's Latin-1 fallback -- confirm
# against the installed pyquery/requests versions).
doc = pq(url='http://www.baidu.com', encoding='utf-8')
print(doc('head'))
D:\Anaconda3\python.exe C:/Users/lenovo/PycharmProjects/爬虫/z19.py
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>ç™¾åº¦ä¸€ä¸‹ï¼Œä½ å°±çŸ¥é“</title></head>
Process finished with exit code 0
单个元素的选取和多个元素的遍历
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Select a single element by class, then iterate over several matches.
from pyquery import PyQuery as pq

html = '''
<html>
<head>
<meta class="X-UA-Compatible" content="IE=Edge" />
<meta http-equiv="content-type" content="text/html;charset=gbk" />
<meta property="wb:webmaster" content="3aababe5ed22e23c" />
<meta name="referrer" content="always" />
<title>百度知道 - 全球最大中文互动问答平台</title>
<link rel="shortcut icon" href="//www.baidu.com/favicon.ico?t=20171027" type="image/x-icon" />
<link rel="icon" sizes="any" mask href="//www.baidu.com/img/baidu.svg" />
'''

doc = pq(html)

# Class selector: only one <meta> carries this class.
li = doc('.X-UA-Compatible')
print(li)

# Tag selector matches several elements; items() lets us visit them one by one.
lis = doc('meta').items()
for meta in lis:
    print(meta)
D:\Anaconda3\python.exe C:/Users/lenovo/PycharmProjects/爬虫/z20.py
<meta class="X-UA-Compatible" content="IE=Edge"/>
<meta class="X-UA-Compatible" content="IE=Edge"/>
<meta http-equiv="content-type" content="text/html;charset=gbk"/>
<meta property="wb:webmaster" content="3aababe5ed22e23c"/>
<meta name="referrer" content="always"/>
Process finished with exit code 0
DOM操作
addClass、removeClass
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# DOM manipulation: toggle a CSS class with removeClass / addClass.
from pyquery import PyQuery as pq

html = '''
<html>
<head>
<meta class="X-UA-Compatible active" content="IE=Edge" />
<meta http-equiv="content-type" content="text/html;charset=gbk" />
<meta property="wb:webmaster" content="3aababe5ed22e23c" />
<meta name="referrer" content="always" />
<title>百度知道 - 全球最大中文互动问答平台</title>
<link rel="shortcut icon" href="//www.baidu.com/favicon.ico?t=20171027" type="image/x-icon" />
<link rel="icon" sizes="any" mask href="//www.baidu.com/img/baidu.svg" />
'''

doc = pq(html)

# Compound class selector: the element must carry both classes.
target = doc('.X-UA-Compatible.active')
print(target)

target.removeClass('active')  # class list shrinks to "X-UA-Compatible"
print(target)

target.addClass('active')  # "active" is appended again
print(target)
D:\Anaconda3\python.exe C:/Users/lenovo/PycharmProjects/爬虫/z21.py
<meta class="X-UA-Compatible active" content="IE=Edge"/>
<meta class="X-UA-Compatible" content="IE=Edge"/>
<meta class="X-UA-Compatible active" content="IE=Edge"/>
Process finished with exit code 0
DOM操作
attr、css
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# DOM manipulation: set an attribute with attr() and an inline style with css().
from pyquery import PyQuery as pq

html = '''
<html>
<head>
<meta class="X-UA-Compatible active" content="IE=Edge" />
<meta http-equiv="content-type" content="text/html;charset=gbk" />
<meta property="wb:webmaster" content="3aababe5ed22e23c" />
<meta name="referrer" content="always" />
<title>百度知道 - 全球最大中文互动问答平台</title>
<link rel="shortcut icon" href="//www.baidu.com/favicon.ico?t=20171027" type="image/x-icon" />
<link rel="icon" sizes="any" mask href="//www.baidu.com/img/baidu.svg" />
'''

doc = pq(html)
target = doc('.X-UA-Compatible.active')
print(target)

target.attr('name', 'link')  # adds name="link" to the element
print(target)

target.css('font-size', '14px')  # written into the element's style attribute
print(target)
D:\Anaconda3\python.exe C:/Users/lenovo/PycharmProjects/爬虫/z22.py
<meta class="X-UA-Compatible active" content="IE=Edge"/>
<meta class="X-UA-Compatible active" content="IE=Edge" name="link"/>
<meta class="X-UA-Compatible active" content="IE=Edge" name="link" style="font-size: 14px"/>
Process finished with exit code 0
remove
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# DOM manipulation: remove() takes matched elements out of the document.
from pyquery import PyQuery as pq

html = '''
<div class='wrap'>
hello,world
<p>This is a paragraph.</p>
</div>
'''

doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())  # text of the wrapper, paragraph included

paragraph = wrap.find('p')
paragraph.remove()  # removes the <p> element itself, so its text goes with it
print(wrap.text())  # only the wrapper's own text is left
D:\Anaconda3\python.exe C:/Users/lenovo/PycharmProjects/爬虫/z23.py
hello,world
This is a paragraph.
hello,world
Process finished with exit code 0
伪类选择器
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Pseudo-class selectors supported by PyQuery.
from pyquery import PyQuery as pq

html = '''
<html>
<head>
<meta class="X-UA-Compatible active" content="IE=Edge" />
<meta http-equiv="content-type" content="text/html;charset=gbk" />
<meta property="wb:webmaster" content="3aababe5ed22e23c" />
<meta name="referrer" content="always" />
<title>百度知道 - 全球最大中文互动问答平台</title>
<link rel="shortcut icon" href="//www.baidu.com/favicon.ico?t=20171027" type="image/x-icon" />
<link rel="icon" sizes="any" mask href="//www.baidu.com/img/baidu.svg" />
'''

doc = pq(html)

selectors = (
    'meta:first-child',    # <meta> that is the first child of its parent
    'meta:nth-child(2)',   # <meta> that is the 2nd child (nth-child counts from 1)
    'meta:gt(2)',          # matches after index 2 (gt counts from 0)
    'meta:nth-child(2n)',  # <meta> elements at even child positions
)
for selector in selectors:
    li = doc(selector)
    print(li)
D:\Anaconda3\python.exe C:/Users/lenovo/PycharmProjects/爬虫/z24.py
<meta class="X-UA-Compatible active" content="IE=Edge"/>
<meta http-equiv="content-type" content="text/html;charset=gbk"/>
<meta name="referrer" content="always"/>
<meta http-equiv="content-type" content="text/html;charset=gbk"/>
<meta name="referrer" content="always"/>
Process finished with exit code 0