简单的正文抽取方法

最新推荐文章于 2022-04-14 17:26:26 发布

_JackSparrow

最新推荐文章于 2022-04-14 17:26:26 发布

阅读量330

点赞数

本文链接：https://blog.csdn.net/qq_42709514/article/details/84951355

版权

简单的正文抽取方法

import re
from bs4 import BeautifulSoup,Comment
import requests

# Byline marker strings ("responsible editor", "author"). run() passes this
# set to getcontent(), which accepts it but does not currently read it.
authorset ={'责任编辑','作者'}

def filter_tags(html_str):
    """Strip an HTML document down to its title and its raw text lines.

    Script elements, style elements and HTML comments are removed, the
    remaining tree is pretty-printed (one tag or text run per line), and every
    tag is then deleted with a regular expression so only text remains.

    Args:
        html_str: The HTML document as a string.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the text of the ``<title>``
        element, or ``''`` when the document has no usable title. ``content``
        is the list of lines left after tag removal; lines may be blank or
        carry leading indentation from ``prettify()``.
    """
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed, so output could differ between machines (and bs4 warns).
    soup = BeautifulSoup(html_str, 'html.parser')

    # A page may lack <title>, or its title may contain nested markup, in
    # which case ``.string`` is None; fall back to an empty title.
    title_tag = soup.title
    if title_tag is not None and title_tag.string is not None:
        title = str(title_tag.string)
    else:
        title = ''

    # Drop elements whose text is code or styling rather than page content.
    for node in soup.find_all(['script', 'style']):
        node.extract()

    # Drop HTML comments; they are not part of the visible text.
    for comment in soup.find_all(string=lambda s: isinstance(s, Comment)):
        comment.extract()

    # prettify() puts each tag on its own line, so deleting the tags leaves
    # one text fragment (or an empty string) per line.
    content = re.sub(r'<[^>]*>', '', soup.prettify()).split('\n')

    return title, content

def getcontent(lst, title, authorset, ret_html=False):
    """Pick the main-text lines out of a list of page text lines.

    The heuristic anchors on the longest non-blank line. The body starts at
    the first line before the anchor that is longer than 30 characters and is
    followed by three lines each longer than 10 characters (or at the first
    line if none qualifies). It ends just before the first line, at or after
    the anchor, that is shorter than 30 characters and whose next line is
    shorter than 10 characters (or at the end of the input if none qualifies).

    Args:
        lst: Iterable of text lines; blank lines are ignored and every line
            is stripped of surrounding whitespace.
        title: Accepted for interface compatibility; not used.
        authorset: Accepted for interface compatibility; not used.
        ret_html: When true, wrap each line in ``<p>...</p>``; otherwise
            append a newline to each line.

    Returns:
        A list of formatted body lines; empty when ``lst`` has no non-blank
        lines.
    """
    threshold = 30      # length that marks a "long" (body-like) line
    min_neighbor = 10   # length the neighbouring lines are compared against

    lines = [text for text in (raw.strip() for raw in lst) if text]
    if not lines:
        # Nothing to anchor on; max() below would raise on an empty list.
        return []

    lengths = [len(text) for text in lines]
    total = len(lengths)

    def length_at(idx):
        # Lines past the end of the document count as empty, so look-ahead
        # near the tail never indexes out of range.
        return lengths[idx] if idx < total else 0

    # Index of the longest line.
    peak = lengths.index(max(lengths))

    start = 0
    for i in range(peak):
        if lengths[i] > threshold and all(
            length_at(i + k) > min_neighbor for k in (1, 2, 3)
        ):
            start = i
            break

    # Default to the end of the input so that a document with no short
    # trailing lines keeps the longest line and everything after it.
    end = total
    for i in range(peak, total):
        if lengths[i] < threshold and length_at(i + 1) < min_neighbor:
            end = i
            break

    body = lines[start:end]
    if ret_html:
        return ['<p>' + text + '</p>' for text in body]
    return [text + "\n" for text in body]

def run(ctthtml):
    """Extract the title and main text from an HTML document.

    Args:
        ctthtml: The HTML document as a string.

    Returns:
        A ``(title, text)`` tuple, where ``text`` is the concatenation of the
        lines selected by ``getcontent`` from the output of ``filter_tags``.
    """
    page_title, raw_lines = filter_tags(ctthtml)
    body_lines = getcontent(raw_lines, page_title, authorset)
    return page_title, ''.join(body_lines)
if __name__ == '__main__':
    # Demo: fetch one news article and print its extracted title and body.
    url = "https://www.thepaper.cn/newsDetail_forward_2722515"
    # Alternative sample page:
    # url = "http://news.ifeng.com/a/20181210/60188059_0.shtml?_zbs_baidu_news"
    headers = {}
    cookies = {}
    resp = requests.get(url=url, headers=headers, cookies=cookies, timeout=6)
    # Compare the status by value; ``is`` tests object identity and is not a
    # reliable way to compare integers.
    if resp.status_code == 200:
        # Decode only after the status check so error pages are never parsed.
        html = resp.content.decode("utf8")
        title, ctt = run(ctthtml=html)
        print(title)
        print(ctt)

_JackSparrow

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
打赏
0
评论
简单的正文抽取方法

简单的正文抽取方法 import re from bs4 import BeautifulSoup,Comment import requests authorset ={'责任编辑','作者'} def filter_tags(html_str): soup =BeautifulSoup(html_str) title =soup.title.string.encode()....
复制链接

扫一扫