# Python: extract the text between <span></span> tags
html = """
<div>
<span class='red'>item1</span>
<div>
<span id='s1'>item2</span>
</div>
</div>
"""
# Method 1: use scrapy's Selector
from scrapy.selector import Selector
# Scrapy selectors support both CSS and XPath queries; the examples below use
# CSS. If you know jQuery from front-end work, the selector syntax will look
# familiar. `::text` is the Scrapy extension that selects the text nodes of
# the matched elements.
# Parse the document once and reuse the selector for every query, instead of
# re-parsing the same HTML for each example.
selector = Selector(text=html)
t1 = selector.css('span.red::text').extract()  # a class is written with "."
print(t1)  # ['item1']
t2 = selector.css('span::text').extract()  # text of every span
print(t2)  # ['item1','item2']
t3 = selector.css('span#s1::text').extract()  # an id is written with "#"
print(t3)  # ['item2']
t4 = selector.css('div>div>span::text').extract()  # span directly inside a nested div
print(t4)  # ['item2']
# Method 2: use bs4 (BeautifulSoup)
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
# Look up the first <span> whose class is "red".
s1 = soup.find(
    'span',
    attrs={"class": "red"},
)
# Collect every <span> in the document, then pull the text out of each one.
s2 = soup.find_all("span")
result = [tag.get_text() for tag in s2]
print(result)  # ['item1', 'item2']
# 1. Use a regular expression to get the content between <td></td> tags
# Example: from <td class="label">行政相对人名称:</td> extract 行政相对人名称:
import re  # required by re.findall below

# Capture the text between <td class="label"> and its closing </td> tag.
# The closing tag must be </td>; with </tb> the pattern can never match
# well-formed markup, and the [0] lookup then raises IndexError.
# NOTE(review): `text` must already hold the HTML to search; it is not
# defined in this snippet.
# [0] keeps only the first match and raises IndexError when there is none.
Name = re.findall(r'<td class="label">(.*?)</td>', text)[0]