- 版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/kun1280437633/article/details/80643451
demo:
import re
from lxml import etree
# Sample 1: a single anchor whose visible text is split by a nested <b> tag.
str1 = '''
<a style="font-weight: bold" par="ssidkey=y&ss=201&ff=03&sg=61dddada6f3e4a62b688a786dac0a17f&so=1"
href="http://jobs.zhaopin.com/641207723250072.htm" target="_blank">销售<b>数据</b>专员</a>
'''

# Sample 2: a small list; note that the last <li> has no closing tag.
data = """
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html" class="aaaaa">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item<span>大家好</span>是吗</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
"""

# Method 1: remove every "<...>" tag with a regular expression.
# `[^>]` also matches newlines, so a tag that spans several lines is removed
# as one match.
dr = re.compile(r'<[^>]+>', re.S)


def _strip_tags(markup):
    """Return *markup* with all tags removed and outer whitespace trimmed."""
    without_tags = dr.sub('', markup)
    return without_tags.strip()


dd = _strip_tags(str1)
print(dd)
# Method 2: parse the markup with lxml and collect text nodes through XPath.
edata = etree.HTML(data)
# `//text()` selects the text nodes of every descendant, so text inside a
# nested tag (the <span> in the fourth item) is returned as its own entry.
dd = edata.xpath('//li[@class="item-1"]//text()')
print(dd)

edata2 = etree.HTML(str1)
dd = edata2.xpath('//a//text()')
# Print the fragments back to back so they read as one continuous string.
# The loop body must be indented; without it the script does not parse.
for fragment in dd:
    print(fragment, end='')
结果:
销售数据专员
['second item', 'fourth item', '大家好', '是吗']
销售数据专员