1. requests
requests 是一个用于简化 HTTP 请求的 Python 库。它是一个用户友好的库,提供了简洁的 API 来发送各种 HTTP 请求(如 GET、POST、PUT、DELETE 等)并处理响应。requests 被广泛应用于网络编程和数据爬取等场景。
用法:
1. 发送 GET 请求
import requests

GITHUB_API_ROOT = 'https://api.github.com'

# Fetch the API root with a plain GET request.
reply = requests.get(GITHUB_API_ROOT)

# Show the HTTP status code, then the raw response body.
print(reply.status_code)
print(reply.text)

# Show the body decoded as JSON (if the response carries JSON).
print(reply.json())
2. 发送 POST 请求
import requests

# Payload handed to requests through the `data=` argument.
form_fields = {'key': 'value'}
reply = requests.post('https://httpbin.org/post', data=form_fields)

print(reply.status_code)
print(reply.text)
3. 发送带有参数的 GET 请求
import requests

# Search qualifiers are separated by spaces. Pass a literal space here:
# requests URL-encodes parameter values itself, so a literal '+' would be
# sent as '%2B' and become part of the search text instead of a separator.
params = {'q': 'requests language:python'}
response = requests.get('https://api.github.com/search/repositories', params=params)

print(response.status_code)
print(response.json())
4. 发送带有 JSON 数据的 POST 请求
import requests

# The `json=` argument makes requests serialize the dict itself, so the
# standard `json` module does not need to be imported for this example.
data = {'name': 'test', 'description': 'test description'}
response = requests.post('https://httpbin.org/post', json=data)

print(response.status_code)
print(response.json())
5. 处理响应头
import requests

reply = requests.get('https://api.github.com')
response_headers = reply.headers

# Print every response header.
print(response_headers)

# Print one specific response header.
print(response_headers.get('Content-Type'))
6. 错误处理
import requests

try:
    response = requests.get('https://api.github.com/invalid-url')
    # raise_for_status() raises HTTPError only for 4xx/5xx responses;
    # other non-200 codes (for example 201 or 204) do not raise.
    response.raise_for_status()
except requests.exceptions.HTTPError as err:
    print(f'HTTP error occurred: {err}')
except requests.exceptions.RequestException as err:
    # Base class of the errors raised by requests (connection failures,
    # timeouts, ...). HTTPError is a subclass, so it must be caught first.
    print(f'Other error occurred: {err}')
7. 设置请求超时
import requests

try:
    # Set the timeout to 5 seconds.
    response = requests.get('https://api.github.com', timeout=5)
    print(response.status_code)
except requests.exceptions.Timeout:
    print('The request timed out')
8. 自定义请求头
import requests

# Send a custom User-Agent header with the request.
custom_headers = {'User-Agent': 'my-app'}
reply = requests.get('https://api.github.com', headers=custom_headers)

print(reply.status_code)
print(reply.json())
2. BeautifulSoup
BeautifulSoup 是一个用于解析和操作 HTML 和 XML 文档的 Python 库。它提供了简单的 API 来搜索和修改网页内容,非常适合用来进行网页抓取和数据提取。BeautifulSoup 主要用于从复杂的网页结构中提取数据,尤其适合处理不规则或含有大量 HTML 嵌套的网页。
用法:
1. 解析 HTML 内容
from bs4 import BeautifulSoup

# Sample HTML document parsed below.
html_doc = """
<html>
<head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the parse tree using the 'lxml' parser.
soup = BeautifulSoup(html_doc, 'lxml')

# Print the formatted HTML.
formatted_html = soup.prettify()
print(formatted_html)
2. 获取元素内容和属性
# Text content of the <title> element.
title_tag = soup.find('title')
print(title_tag.get_text())

# Value of the href attribute on the <a> element whose id is "link1".
first_link = soup.find('a', id='link1')
print(first_link['href'])
3. 处理复杂的 HTML 结构
# Find the first <p> tag and get its parent node.
p_tag = soup.find('p')
parent = p_tag.parent
print(parent)

# Iterate over the direct children of the <p> tag
# (.children does not descend into nested tags).
for child in p_tag.children:
    print(child)
3. lxml
lxml 是一个用于处理和解析 XML 和 HTML 文档的 Python 库。它基于 libxml2 和 libxslt 库,提供了高效、灵活且功能丰富的 API。lxml 既可以用来解析 HTML 和 XML 文档,也可以用于创建、修改和处理这些文档。
用法:
1. 解析 XML 文档
from lxml import etree

xml_content = """
<root>
<child name="first">Value 1</child>
<child name="second">Value 2</child>
</root>
"""

# Parse the XML document.
root = etree.fromstring(xml_content)

# Print the tag name of the root element.
root_tag = root.tag
print(root_tag)  # Output: root

# Print the text content of the first child element.
first_child = root[0]
print(first_child.text)  # Output: Value 1
2. 使用 XPath 查询
from lxml import etree

xml_content = """
<root>
<child name="first">Value 1</child>
<child name="second">Value 2</child>
</root>
"""
root = etree.fromstring(xml_content)

# Use XPath to find all <child> elements.
children = root.xpath('//child')
for child in children:
    print(child.text)

# Find the element with a specific attribute value.
specific_child = root.xpath('//child[@name="second"]')[0]
print(specific_child.text)  # Output: Value 2
3. 创建和修改 XML 文档
from lxml import etree

# Build a <root> element containing one <child> element with text.
root = etree.Element("root")
child = etree.SubElement(root, "child")
child.text = "This is a child element"

# Serialize with pretty-printing, then decode the result for printing.
serialized = etree.tostring(root, pretty_print=True)
print(serialized.decode())
from lxml import etree

xml_content = """
<root>
<child name="first">Value 1</child>
</root>
"""
root = etree.fromstring(xml_content)

# Change the text of the first <child> element.
first_child = root.find('.//child')
first_child.text = "New Value"

serialized = etree.tostring(root, pretty_print=True)
print(serialized.decode())
4. 使用 XSLT 转换
from lxml import etree

# Source XML content.
xml_content = """
<root>
<child name="first">Value 1</child>
<child name="second">Value 2</child>
</root>
"""

# XSLT stylesheet used for the transformation.
xslt_content = """
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:template match="/">
<html>
<body>
<h2>Transformed XML</h2>
<ul>
<xsl:for-each select="root/child">
<li><xsl:value-of select="."/></li>
</xsl:for-each>
</ul>
</body>
</html>
</xsl:template>
</xsl:stylesheet>
"""

# Parse both documents, then build the transform from the stylesheet.
source_tree = etree.fromstring(xml_content)
stylesheet_tree = etree.fromstring(xslt_content)
apply_stylesheet = etree.XSLT(stylesheet_tree)

# Apply the transform to the source document and print the result as text.
transformed = apply_stylesheet(source_tree)
print(str(transformed))