# A simple method for extracting the main text (article body) from a web page.
import re
from bs4 import BeautifulSoup,Comment
import requests
# Labels '责任编辑' (responsible editor) and '作者' (author). run() passes this
# set to getcontent(); presumably intended for spotting byline lines -- TODO
# confirm, since getcontent() does not read it in the code visible here.
authorset ={'责任编辑','作者'}
def filter_tags(html_str):
    """Reduce an HTML document to its title and its raw text lines.

    Scripts, styles and HTML comments are removed, the remaining markup is
    pretty-printed, and every tag is then deleted with a regular expression
    so that only the text found between tags is left.

    Args:
        html_str: Markup of the page to process.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the text of the
        ``<title>`` element, or ``''`` when the page has no title or the
        title is not a single text node. ``content`` is the list of lines of
        the de-tagged, pretty-printed document; blank lines and indentation
        are still present.
    """
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    soup = BeautifulSoup(html_str, 'html.parser')

    # soup.title is None when there is no <title>, and .string is None when
    # the title contains nested markup; fall back to '' instead of raising.
    title_tag = soup.title
    if title_tag is not None and title_tag.string is not None:
        title = str(title_tag.string)
    else:
        title = ''

    # Drop <script> and <style> elements first: their text is never content.
    for tag in soup.find_all(['script', 'style']):
        tag.extract()

    # HTML comments are not elements, so they have to be removed separately.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # prettify() normalises the markup; the regular expression then deletes
    # every tag (attributes included) and keeps only the text between tags.
    tag_pattern = re.compile("<[^>]*>")
    content = tag_pattern.sub('', soup.prettify()).split('\n')
    return title, content
def getcontent(lst, title, authorset, ret_html=False):
    """Pick the article body out of the text lines of a de-tagged page.

    Blank lines are discarded and the remaining lines are stripped. The
    longest line is taken as an anchor inside the body. The body starts at
    the first line before the anchor that is longer than 30 characters and is
    followed by three lines longer than 10 characters (or at the first line
    when no such line exists). It ends just before the first line, scanning
    from the anchor onwards, that is shorter than 30 characters and is
    followed by a line shorter than 10 characters (or at the end of the
    document when no such line exists).

    Args:
        lst: Iterable of text lines, e.g. the ``content`` list returned by
            ``filter_tags``.
        title: Page title. Accepted for interface compatibility; not used.
        authorset: Set of byline labels. Accepted for interface
            compatibility; not used.
        ret_html: When true, wrap every body line in ``<p>...</p>``;
            otherwise terminate every body line with a newline.

    Returns:
        A list of body lines, empty when ``lst`` has no non-blank line.
    """
    threshold = 30      # a line longer than this counts as body text
    neighbor_min = 10   # a line shorter than this counts as boilerplate

    # Keep stripped, non-blank lines. Emptiness is tested by truthiness
    # rather than by identity with the "" literal.
    lines = [text for text in (raw.strip() for raw in lst) if text]
    if not lines:
        # Nothing to analyse; max() below would raise on an empty sequence.
        return []

    lengths = [len(line) for line in lines]
    count = len(lengths)
    # Index of the longest line, used as the anchor of the article body.
    maxindex = lengths.index(max(lengths))

    def length_at(index):
        # Positions past the last line count as empty lines, so look-ahead
        # checks near the end of the document cannot raise IndexError.
        return lengths[index] if index < count else 0

    startindex = 0
    for i in range(maxindex):
        if lengths[i] > threshold and all(
                length_at(i + step) > neighbor_min for step in (1, 2, 3)):
            startindex = i
            break

    # Default to the end of the document: when no terminating short line is
    # found, the body runs through the last line instead of being cut off
    # just before the anchor line.
    stopindex = count
    for i in range(maxindex, count):
        if lengths[i] < threshold and length_at(i + 1) < neighbor_min:
            stopindex = i
            break

    body = lines[startindex:stopindex]
    if ret_html:
        return ['<p>' + line + '</p>' for line in body]
    return [line + "\n" for line in body]
def run(ctthtml):
    """Extract the title and the main text of an HTML page.

    Args:
        ctthtml: Markup of the page, handed to ``filter_tags``.

    Returns:
        A ``(title, text)`` tuple, where ``text`` is the concatenation of the
        body lines selected by ``getcontent``.
    """
    page_title, raw_lines = filter_tags(ctthtml)
    body_lines = getcontent(raw_lines, page_title, authorset)
    return page_title, ''.join(body_lines)
if __name__ == '__main__':
    # Demo: download one article and print its extracted title and body.
    url = "https://www.thepaper.cn/newsDetail_forward_2722515"
    # Alternative sample page:
    # url = "http://news.ifeng.com/a/20181210/60188059_0.shtml?_zbs_baidu_news"
    headers = {}
    cookies = {}
    resp = requests.get(url=url, headers=headers, cookies=cookies, timeout=6)
    # Compare the status code by value: `is` tests object identity, which
    # only happens to work for small integers on CPython.
    if resp.status_code == 200:
        # Decode only successful responses; an error page may not be UTF-8.
        html = resp.content.decode("utf8")
        title, ctt = run(ctthtml=html)
        print(title)
        print(ctt)