# Extract text together with emoticons (read from the `alt` attribute of <img> tags)
#!/usr/bin/env python
# encoding: utf-8
from functools import reduce
from lxml import html
from bs4 import BeautifulSoup
# Sample markup for the demo: a <div> holding an emoticon <img> (inside a
# <span>) followed by plain text.
# NOTE: this assignment rebinds the name `html`, hiding the `lxml.html`
# module imported at the top of the file.
html="""
<div><span class="url-icon"><img alt="[馋嘴]" src="//h5.sinaimg.cn/m/emoticon/icon/default/d_chanzui-ad3f4f182c.png" style="width:1em; height:1em;"/></span>听着就很好吃</div>
"""
def main():
    """Parse the module-level sample markup and print its flattened text.

    The first <div> of the sample is handed to ``parse_div`` and the
    resulting string is printed.

    NOTE(review): this file later defines ``main(self, htmlstr)``, which
    replaces this binding once the module has been fully executed.
    """
    soup = BeautifulSoup(html, 'html.parser')
    print(parse_div(soup.find('div')))
def parse_div(div_tags):
    """Flatten a tag's children into a single string.

    Args:
        div_tags: An object exposing ``contents`` (an iterable of children).
            Non-string children must provide ``has_attr`` and ``get``, and
            ``contents`` as well when they carry no ``alt`` attribute.

    Returns:
        The concatenation, in child order, of: string children with
        newlines and spaces removed, the ``alt`` value of children that
        have one, and the recursively flattened text of all other children.
    """

    def _render(node):
        # Text node: drop newlines and spaces.
        if isinstance(node, str):
            return node.replace('\n', '').replace(' ', '')
        # Element carrying an `alt` attribute: use the attribute value.
        if node.has_attr('alt'):
            return node.get('alt', '')
        # Any other element: descend into its children.
        return parse_div(node)

    return ''.join([_render(child) for child in div_tags.contents])
# Best solution: a single XPath query collects both text nodes and alt values.
def main(self, htmlstr):
    """Return the text and ``alt`` attribute values of *htmlstr*, concatenated.

    Args:
        self: Unused; kept so the existing signature stays the same.
        htmlstr: Markup to parse with ``lxml.html.fromstring``.

    Returns:
        All text nodes and ``alt`` attribute values matched by the XPath
        query, joined in the order the query yields them, with newlines,
        spaces and zero-width spaces (U+200B) removed.
    """
    # Import under an alias: the module-level name `html` is reassigned to
    # the sample markup string, so `html.fromstring` would be looked up on
    # a str and raise AttributeError.
    from lxml import html as lxml_html

    root = lxml_html.fromstring(htmlstr)
    nodes = root.xpath(".//text()|.//@alt")
    return ''.join(
        node.replace('\n', '').replace(" ", "").replace("\u200b", "")
        for node in nodes
    )
if __name__ == '__main__':
    # By this point `main` is bound to the later `main(self, htmlstr)`
    # definition, so a bare `main()` raises TypeError. Run the BeautifulSoup
    # demo directly: parse the sample markup and print its flattened text.
    print(parse_div(BeautifulSoup(html, 'html.parser').find('div')))