# Parsing with the lxml parser backend
from bs4 import BeautifulSoup

snippet_soup = BeautifulSoup("<p>Hello</p>", "lxml")
# lxml auto-completes the missing <html>/<body> wrapper; .p.string is the text node
print(snippet_soup.p.string)
# Basic Beautiful Soup usage
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse’s story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse’s story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!--Elsie--></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# prettify() re-serializes the (auto-corrected) tree with standard indentation
print(soup.prettify())
print('-------------')
# .title.string is the text content of the <title> node
print(soup.title.string)
# Node selectors: access elements directly as attributes of the soup object
from bs4 import BeautifulSoup

markup = """
<html><head><title>The Dormouse’s story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse’s story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!--Elsie--></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(markup, 'lxml')
# Attribute-style lookup returns the FIRST matching element as a Tag
print(soup.title)
print(type(soup.title))
print('----------')
print(soup.title.string)
print(soup.head)
print(soup.p)
# Expected output:
# <title>The Dormouse’s story</title>
# <class 'bs4.element.Tag'>
# ----------
# The Dormouse’s story
# <head><title>The Dormouse’s story</title></head>
# <p class="title" name="dromouse"><b>The Dormouse’s story</b></p>
# Nested selection: tag lookups can be chained, each step yields another Tag
nested_html = """
<html>
<head>
<title>The Dormouse 's story</title>
</head>
<body>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(nested_html, "lxml")
title_tag = soup.head.title
print(title_tag)
print(type(title_tag))
print(title_tag.string)
# Associated selection: child / descendant / parent relations
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse’s story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# .contents gives the direct children as a list (text nodes included)
print(soup.p.contents)
# ['Once upon a time there were three little sisters; and their names were\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, '\nand they lived at the bottom of a well.']
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# .children is the same content as an iterator; .descendants walks every
# level below, .parent goes one level up
print(soup.p.children)
for index, node in enumerate(soup.p.children):
    print(index, node)
# Method selectors
# find_all(name, attrs, recursive, text, **kwargs) returns every match as a
# list; find() takes the same arguments but returns only the first match.
# NOTE: the "Jay" item's closing tag was garbled as "<lli>" in the source
# transcription; corrected to "</li>".
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(name='ul'))               # match by tag name
print(soup.find_all(attrs={'id': 'list-2'}))  # match by arbitrary attributes
print(soup.find_all(id='list-2'))             # keyword shortcut for id
print(soup.find_all(class_='element'))        # class_ because class is reserved
print(type(soup.find_all(name='ul')[0]))      # results are bs4.element.Tag objects
# CSS selectors via the select() method
# NOTE: the first "Foo" item's closing tag was garbled as "<lli>" in the
# source transcription; corrected to "</li>". The snippet is deliberately
# truncated — lxml auto-closes the open tags.
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
'''
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
print(soup.select('.panel .panel-heading'))  # descendant combinator, by class
print(soup.select('ul li'))                  # every li inside any ul
print(soup.select('#list-2 .element'))       # id selector + class selector
print(type(soup.select('ul')[0]))            # select() also yields Tag objects
# Nested selection: every Tag supports select() itself
for ul in soup.select('ul'):
    print(ul.select('li'))
# After the loop, ul is the LAST <ul> matched above (only one here).
# Attribute access: subscripting and .attrs are equivalent
print(ul['id'])
print(ul.attrs['id'])
# Text extraction: get_text() concatenates all descendant text;
# .string is None when the node has more than one child
print('Get Text:', ul.get_text())
print('String:', ul.string)
# Beautiful Soup 需要lxml解析器
代码如下:
from bs4 import BeautifulSoup
soup=BeautifulSoup('<p>Hello</p>','lxml') # 自动更正(body/html节点)
print(soup.prettify()) # 将解析的字符串以标准的缩进格式输出
print(soup.p.string)
print(soup.title.string) # 输出title节点内容
# 提取信息
1) 获取名称 print(soup.title.name)
2) 获取属性 print(soup.p.attrs['name'])  # 注:本例中 title 节点没有属性,name 属性在 p 节点上
3) 获取内容 print(soup.title.string)
# 嵌套选择 soup.head.title.string
# 关联选择
1) 直接子节点 soup.p.contents(返回列表)与 soup.p.children(返回迭代器),二者内容相同
2) 所有子孙节点 soup.p.descendants,父节点 soup.p.parent 所有父节点soup.p.parents
3) 兄弟节点 soup.p.next_sibling(previous_sibling,next_siblings,previous_siblings)
# 方法选择器
1) find_all() find_all(name,attrs,recursive,text,**kwargs)
name 参数: soup.find_all(name='ul')
attrs 参数: soup.find_all(attrs={'id':'list-1'})
text 参数: soup.find_all(text=re.compile('link'))  # 需先 import re
2) find()
find_parent()/find_parents(), find_next_sibling()/find_next_siblings(), find_previous_sibling()/find_previous_siblings()
find_all_next(),find_all_previous()
CSS选择器 调用select()
soup.select('.panel .panel-heading')
soup.select('ul li')
soup.select('#list-2 .element')
# 选择
for ul in soup.select('ul'):
print(ul.select('li')) # 嵌套选择
print(ul['id']) # 获取属性
print(ul.get_text()) # 获取文本 同ul.string
# pyquery
基本代码:
from pyquery import PyQuery as pq # 引出PyQuery类
html="***"
doc =pq(html)
print(doc('li')) # 打印所有的li标签
1) url 初始化
from pyquery import PyQuery as pq
doc =pq(url='*') # 等同于 doc =pq(requests.get('*').text)
print(doc('title'))
2) 文件初始化
doc=pq(filename='demo.html')
3) 基于CSS选择器
doc=pq(html)
print(doc('#container .list li')) # 选取id为container的节点其内部class为list的li节点
4) 子节点
text=doc('.list').find('li') # 所有子孙节点
text=doc('.list').children() # 直接子节点
text=doc('.list').children('.active') # 直接子节点的active节点
5) 父节点
text=doc('.list').parent() # 直接父节点
text=doc('.list').parents() # 祖先节点
text=doc('.list').parents('.wrap') # 祖先节点的wrap节点
6) 兄弟节点
text=doc('.list').siblings('.active')
7) 获取属性
text=doc('.list a').attr('href') # 获取第一个匹配 a 节点的 href,等同于属性写法 .attr.href
8) 节点操作
i addClass添加 removeClass删除
ii attr('name','link') 加属性name='link' text('change item') 加文本 同 html('*')
iii remove 移除 a.find('p').remove()
9) 伪类选择器
doc('li:first-child') # 第一个
doc('li:last-child') # 最后一个
doc('li:nth-child(2)') # 第二个li节点
doc('li:gt(2)') # 第三个li之后的li节点
doc('li:nth-child(2n)')# 偶数位置的li节点
doc('li:contains(second)') # 包含second文本的li节点
代码摘抄之《Python 3网络爬虫开发实战》
本文详细介绍了Python中BeautifulSoup库和PyQuery库的使用方法,包括解析HTML、节点选择、方法选择器、CSS选择器等。通过实例展示了如何提取页面元素、属性、文本,以及如何进行嵌套和关联选择。此外,还提及了PyQuery的基本操作,如URL初始化、CSS选择器和节点操作。这些内容对于理解和实现网页数据抓取至关重要。
664

被折叠的 条评论
为什么被折叠?



