爬虫--2.三种解析库（XPath、Beautiful Soup、PyQuery）

最新推荐文章于 2024-02-02 03:53:26 发布

Amanda_ABAP_Python

最新推荐文章于 2024-02-02 03:53:26 发布

阅读量324

点赞数

分类专栏： python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/Amanda_python/article/details/110123565

版权

python 专栏收录该内容

101 篇文章

订阅专栏

下面以代码的形式展示这三种解析库的用法，帮助大家熟练掌握如何用代码提取并输出内容：

XPath

from lxml import etree

# Read the sample page. The context manager closes the file even if
# read() raises, which the manual open()/close() pair did not guarantee.
with open('./my.html', 'r', encoding='utf-8') as f:
    content = f.read()

# Parse the HTML text; etree.HTML returns the root element of the tree.
html = etree.HTML(content)

# Sample XPath queries. Each assignment replaces the previous `result`,
# so only the nodes matched by the final query are listed below.
result = html.xpath("//*")     # every element in the document
result = html.xpath("/*")      # the root element only
result = html.xpath("//li")    # all <li> elements
result = html.xpath("//li/a")  # <a> elements that are direct children of an <li>
result = html.xpath("//a/..")  # the parent element of every <a>

# Write each tag name followed by a single space, then end the line.
print("".join(f"{node.tag} " for node in result))


# Print the text content of the <h3> whose id attribute is "hid".
print(html.xpath("//h3[@id='hid']/text()"))  # sample output: ['我的常用链接']

# Extract hyperlink information from the page.
# Each assignment replaces the previous `result`; only the last query is used.
result = html.xpath("//li")  # all <li> nodes
result = html.xpath("//li[@class='item-0']")  # <li> nodes whose class is exactly 'item-0'
result = html.xpath("//li[contains(@class,'shop')]")  # <li> nodes whose class contains 'shop'

for t in result:
    a = t.find("a")
    # find() returns None when the <li> has no direct <a> child; skip such
    # items instead of failing with AttributeError on a.text.
    if a is None:
        continue
    print(a.text, ":", a.get("href"))


'''
HTML元素的属性：
 tag：元素标签名
 text：标签中间的⽂本
HTML元素的⽅法：
 find() 查找⼀个匹配的元素
 findall() 查找所有匹配的元素
 get(key, default=None) 获取指定属性值
 items() 获取元素属性，作为序列返回
 keys() 获取属性名称列表
 values() 将元素属性值作为字符串序列返回
'''

Beautiful Soup

# Import the module
from bs4 import BeautifulSoup

# Read the HTML file (in a real crawler this would be the fetched page).
# The context manager closes the file even if read() raises.
with open('./my_1.html', 'r', encoding='utf-8') as f:
    content = f.read()

# Build the parse tree with the lxml parser.
soup = BeautifulSoup(content, 'lxml')

# Approach 1: node selectors (tags reached as attributes of the soup)
title_tag = soup.title
print(title_tag)  # sample output: <title>我的网页</title>

# Text held by the <title> tag
print(title_tag.string)  # sample output: 我的网页

print(soup.h3)  # sample output: <h3 id="hid">我的常用链接</h3>
# First <li> element.
# Sample output: <li class="item-0"><a href="http://www.baidu.com">百度</a></li>
print(soup.li)
unordered_list = soup.ul
print(unordered_list)
print(unordered_list.contents)


# Take the <ul> and walk over its direct children.
blist = soup.ul.children
print(blist)  # sample output: <list_iterator object at 0x0000026712FD80D0>
# `children` is an iterator, so loop over it to reach the nodes.
for child in blist:
    # Printing child.name for every child gives:
    #   None li None li None li None li None li None
    # The None entries come from the line breaks between the tags,
    # so only real <li> nodes are processed.
    if child.name != 'li':
        continue
    link = child.a
    print(link.string, ":", link.attrs['href'])  # link text and its href attribute
# Sample output:
#   百度 : http://www.baidu.com
#   京东 : http://www.jd.com
#   搜狐 : http://www.sohu.com
#   新浪 : http://www.sina.com
#   淘宝 : http://www.taobao.com

# Approach 2: method selectors
blist = soup.find_all("li")  # every <li> node
for item in blist:
    link = item.find("a")  # the first <a> inside this <li>
    # get_text() is used for the link text; link.string is the alternative
    # shown in the node-selector example.
    print(f"{link.get_text()} : {link.attrs['href']}")


# Approach 3: CSS selectors
print(soup.select("li"))       # all <li> nodes
print(soup.select("li.shop"))  # <li> nodes whose class list includes "shop"
print(soup.select("ul li a"))  # <a> nodes under an <li> inside a <ul>
blist = soup.select("ul li")
for item in blist:
    # select() returns a list, so index [0] is needed to get the <a> tag itself.
    link = item.select("a")[0]
    print(f"{link.get_text()} : {link.attrs['href']}")

PyQuery
在此之前先说明一个问题：用 URL 初始化时可以正常运行，但用本地文件初始化时一直报错，后来找到了两种解决方法：

from pyquery import PyQuery as pq

# Initialise from a URL -- the author reports that this runs normally.
doc = pq(url="http://www.baidu.com",encoding="utf-8")
print(doc('title'))

# Initialise from a file -- the author reports that this call kept failing
# (noted as "error 290") on Windows 10, while macOS seemed unaffected.
# NOTE(review): presumably an encoding problem when the file is decoded --
# the cause is not shown here; confirm against the actual traceback.
doc = pq(filename='my.html',encoding = 'utf-8')
print(doc('title'))

# Workaround 1 for file initialisation: read the file explicitly as UTF-8
# and pass the resulting text to PyQuery.
with open('my.html','r',encoding='UTF-8') as f:  # open the file as text
    text_new = f.read()
doc = pq(text_new)
print(doc('title'))

# Workaround 2: replace the Chinese text in my.html with English text;
# the author reports that the file initialisation then runs normally.

完整代码：

from pyquery import PyQuery as pq

# Initialise from a local file: read the text first, then hand it to PyQuery.
with open('my.html', mode='r', encoding='utf-8') as f:
    text_new = f.read()
doc = pq(text_new)

# Query each selector in turn and print the matching nodes.
for selector in (
    'title',    # sample output: <title>我的网页</title>
    'h3',       # sample output: <h3 id="hid">我的常用链接</h3>
    '#hid',     # sample output: <h3 id="hid">我的常用链接</h3>
    'ul li',    # sample output: the five <li> elements listed below
    'ul li a',  # every <a> under "ul li"
    'a',        # every <a> in the page
    'a:first',  # the first <a> in the page
    'a:last',   # the last <a> in the page
):
    print(doc(selector))
# Sample output for 'ul li':
#   <li class="item-0"><a href="http://www.baidu.com">百度</a></li>
#   <li class="item-1 shop"><a href="http://www.jd.com">京东</a></li>
#   <li class="item-2 shop"><a href="http://www.sohu.com">搜狐</a></li>
#   <li class="item-3"><a href="http://www.sina.com">新浪</a></li>
#   <li class="item-4 shop"><a href="http://www.taobao.com">淘宝</a></li>

# All nodes whose class list includes "shop"
print(doc(".shop"))
print(doc("li.shop"))  # only the <li> nodes with class "shop"

# <a> nodes whose href contains 'jd'. The attribute selector must be closed
# with "]"; the original string omitted it, leaving a malformed selector.
print(doc("a[href*='jd']"))

# Collect every <a> inside an <li> inside a <ul>.
# Approach 1: query once and work with the whole selection.
alist = doc("ul li a")
print(alist)
# Sample output:
#   <a href="http://www.baidu.com">百度</a>
#   <a href="http://www.jd.com">京东</a>
#   <a href="http://www.sohu.com">搜狐</a>
#   <a href="http://www.sina.com">新浪</a>
#   <a href="http://www.taobao.com">淘宝</a>

# Both accessors report only the first matched element, so iterating over
# the selection is usually the better choice.
print(alist.attr("href"))  # sample output: http://www.baidu.com
print(alist.html())        # sample output: 百度

# Approach 2: iterate over the selection; items() yields one PyQuery
# object per matched <a>.
for link in alist.items():
    href = link.attr.href
    print(link)
    print(href)
    # link.text() and link.html() both give the link's content here.
    print(f"{link.text()} : {href}")

# The three listings below show, one print statement at a time, what the
# loop produces for the sample page.
#
# print(link):
#   <a href="http://www.baidu.com">百度</a>
#   <a href="http://www.jd.com">京东</a>
#   <a href="http://www.sohu.com">搜狐</a>
#   <a href="http://www.sina.com">新浪</a>
#   <a href="http://www.taobao.com">淘宝</a>
#
# print(href):
#   http://www.baidu.com
#   http://www.jd.com
#   http://www.sohu.com
#   http://www.sina.com
#   http://www.taobao.com
#
# print(text, ":", href):
#   百度 : http://www.baidu.com
#   京东 : http://www.jd.com
#   搜狐 : http://www.sohu.com
#   新浪 : http://www.sina.com
#   淘宝 : http://www.taobao.com