1. re 模块本质上属于对字符串的强制（模式）提取。
常用的 HTML 解析方法有：
# Sample HTML for the three parsing demos below.
# NOTE(review): the original snippet had lost its tags (only the bare text
# survived), so none of the documented results could actually be produced.
# The markup is reconstructed so every printed result in this file holds:
#   - first <p> holds only 哈哈  (so soup.div.p.string == '哈哈')
#   - 你真 sits in a <span>      (so "div *::text" still finds it)
#   - <img> sits alone in a <p>  (so //div/p/img/@src matches and the
#                                 last <p>'s .string is not None)
#   - last <p> holds only 搞笑   (so find_all('p')[-1].string == '搞笑')
#   - div text == '\n哈哈你真搞笑\n', so get_text()[3:-3] == '你真'
html = """
<div>
<p>哈哈</p><span>你真</span><p><img src="www.baidu.com"/></p><p>搞笑</p>
</div>
"""
# Parse the sample HTML with scrapy's Selector.
from scrapy import Selector

sel = Selector(text=html)
# Collect the text nodes of every element under <div>, dropping
# whitespace-only fragments.
non_blank = []
for fragment in sel.css("div *::text").getall():
    if fragment.replace("\n", ""):
        non_blank.append(fragment)
print(non_blank)
# First src attribute of an <img> anywhere under <div>.
src_value = sel.css("div img::attr(src)").get()
print(src_value)
"""
结果
['哈哈', '你真', '搞笑']
www.baidu.com
"""
# Parse the same HTML with BeautifulSoup (lxml backend).
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
# .string of the first <p> under the first <div>.
first_p_text = soup.div.p.string
print(first_p_text)
# src attribute of the first <img> in the document.
img_src = soup.img.attrs['src']
print(img_src)
# Slice the middle out of the div's concatenated text.
middle_text = soup.div.get_text()[3:-3]
print(middle_text)
# .string of the last <p> under the div.
last_p_text = soup.div.find_all('p')[-1].string
print(last_p_text)
print(first_p_text, img_src, middle_text, last_p_text)
"""
结果
哈哈
www.baidu.com
你真
搞笑
哈哈 www.baidu.com 你真 搞笑
"""
# Parse the same HTML with lxml via XPath.
from lxml import etree

# Build an XPath-capable document object (deliberately rebinds `html`).
html = etree.HTML(html)
# Every text node under <div>, with whitespace-only fragments dropped.
fragments = html.xpath("//div//text()")
print([t for t in fragments if t.replace("\n", "")])
# src attributes of <img> elements that are children of <p> under <div>.
url = html.xpath("//div/p/img/@src")
print(url)
"""
结果
['哈哈', '你真', '搞笑']
['www.baidu.com']
"""
还可以用于解析的模块有：
pyquery、re 等等。