BS4_获取ZGRB-每日一词内容

LninarMing

已于 2023-02-06 15:05:10 修改

阅读量159

点赞数

分类专栏：额外提升　文章标签：爬虫

于 2023-02-06 12:25:40 首次发布

本文链接：https://blog.csdn.net/u010005344/article/details/128899492

版权

额外提升专栏收录该内容

1 篇文章 0 订阅

订阅专栏

该代码示例展示了如何利用Python的requests库获取网页内容，然后使用BeautifulSoup解析HTML，提取文章标题、概述、知识点和重要信息，并将结果整合后写入文件。主要涉及网络爬虫和数据处理技术。

摘要由CSDN通过智能技术生成

获取某网站内容，为内容管理平台提供文章。

思路：

获取整体文章内容（requests），再进行处理（BeautifulSoup）以达到获取文章标题，获取新闻概述，获取【知识点】，【重要讲话】，【相关词汇】等内容，再根据内容要求，用指定格式进行拼接，最后写入文件。

main里运行内容：


    # Pages to scrape; the dates in the URL paths run from 2022-11-04 back
    # to 2022-10-31.
    list_url = [
        "https://language.chinadaily.com.cn/a/202211/04/WS6364d5bca3105ca1f22741dc.html",
        "https://language.chinadaily.com.cn/a/202211/03/WS63638569a310fd2b29e80256.html",
        "https://language.chinadaily.com.cn/a/202211/02/WS636233e8a310fd2b29e7fec5.html",
        "https://language.chinadaily.com.cn/a/202211/01/WS6360e13ca310fd2b29e7fb42.html",
        "https://language.chinadaily.com.cn/a/202210/31/WS635fbfe7a310fd2b29e7f829.html",
    ]
    content = ''
    for url_page in list_url:
        text = get_html_text(url_page)
        # get_html_text reports a failed download with the literal "error".
        # Parsing that string yields no <h1>/<figure>, which would crash the
        # extractors and lose every page collected so far, so skip the page.
        if text == "error":
            print('skip (download failed): ' + url_page)
            continue
        content = (
            content
            + get_title_data(text)
            + get_previous_data(text)
            + get_important_content(text)
            + '\n\n\n'
        )
        # Progress output: prints everything accumulated so far.
        print(content)
    generate_document_with_text(content)
    print('end')

1、获取网页内容

def get_html_text(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or the literal string ``"error"`` when the request
        fails (connection problem, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into an exception so they take the same
        # failure path as network errors.
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Only request failures are mapped to the sentinel; KeyboardInterrupt,
        # SystemExit and programming errors are no longer swallowed.
        return "error"

2、获取文章标题

def get_title_data(content):
    """Build the formatted title line from the page's ``<h1><span>`` text.

    The heading is cut at fixed offsets and at its first space:
    ``heading[3:]``, a newline, the text after the first space, ``+++``,
    the text between offset 5 and the first space, then ``===``.
    Every ``∣`` in the result is replaced with ``：``.

    Args:
        content: HTML source of the article page.

    Returns:
        The formatted title string.
    """
    heading = BeautifulSoup(content, 'html.parser').h1.span.string
    first_space = heading.find(' ')
    # NOTE(review): presumably the heading is "<prefix><Chinese> <English>";
    # the offsets 3 and 5 depend on that layout - confirm against the site.
    after_space = heading[first_space + 1:]
    before_space = heading[5:first_space]
    formatted = f"{heading[3:]}\n{after_space}+++{before_space}==="
    return formatted.replace('∣', '：')

3、获取新闻概述

def get_previous_data(content):
    """Collect the text of the nodes that precede the first ``<figure>``.

    Newlines are removed from each sibling's text and empty results are
    skipped. Each kept text is followed by a separator: ``+++`` after the
    1st, 3rd, ... kept text and ``===`` after the 2nd, 4th, ...

    Args:
        content: HTML source of the article page.

    Returns:
        The concatenated texts with their separators.
    """
    figure = BeautifulSoup(content, 'html.parser').find('figure')
    pieces = []
    kept = 0
    # The summary lives in the <p> elements placed before the <figure>.
    for node in figure.previous_siblings:
        cleaned = node.getText().replace('\n', '')
        if not cleaned:
            continue
        kept += 1
        separator = "===" if kept % 2 == 0 else "+++"
        pieces.append(cleaned + separator)
    return ''.join(pieces)

4、获取其他内容

def get_important_content(content):
    """Extract the labelled sections that follow the first ``<figure>``.

    Each following sibling's text is stripped of newlines and ``\\xa0``;
    empty results are ignored. A text containing one of the section markers
    switches the current section and emits a heading line:

    * ``【知识点】``   -> section 1: every later text is copied on its own line.
    * ``【重要``       -> section 2: texts are buffered in threes; each full
      group is written as ``divideSegment(group)``, a newline, the third
      text and ``===``.
    * ``【相关词汇】`` -> section 3: texts are buffered in pairs and written
      as ``second+++first===``.

    Texts seen before any marker are dropped. The buffers are not reset when
    the section changes, so an incomplete group carries over.

    Args:
        content: HTML source of the article page.

    Returns:
        The assembled text for all sections.
    """
    figure = BeautifulSoup(content, 'html.parser').find('figure')
    out = []
    section = 0
    speech_group = []
    word_pair = []
    for node in figure.next_siblings:
        line = node.getText().replace('\n', '').replace('\xa0', '')
        if not line:
            continue
        if '【知识点】' in line:
            out.append("【知识点】" + "\n")
            section = 1
        elif '【重要' in line:
            # This heading keeps its full original text.
            out.append(line + "\n")
            section = 2
        elif '【相关词汇】' in line:
            out.append("【相关词汇】" + "\n")
            section = 3
        elif section == 1:
            out.append(line + '\n')
        elif section == 2:
            speech_group.append(line)
            if len(speech_group) == 3:
                # The first two entries may need splitting into sentences;
                # the third entry is appended unchanged.
                out.append(divideSegment(speech_group) + '\n' + speech_group[2] + '===')
                speech_group.clear()
        elif section == 3:
            word_pair.append(line)
            if len(word_pair) == 2:
                out.append(word_pair[1] + '+++' + word_pair[0] + '===')
                word_pair.clear()
    return ''.join(out)

5、可能需要对段落进行分句

def divideSegment(list_article):
    """Pair up the sentences of a Chinese paragraph and its English version.

    ``list_article[0]`` is split on ``。`` and ``list_article[1]`` on ``.``.
    When both yield the same number of fragments, more than two, and the text
    ends with its sentence terminator, the result interleaves them as
    ``eng1.+++ch1。+++eng2.+++ch2。``. Otherwise the paragraphs are returned
    unsplit as ``english+++chinese``.

    Args:
        list_article: Sequence whose first item is the Chinese paragraph and
            whose second item is the English paragraph; further items are
            ignored.

    Returns:
        The formatted segment string.
    """
    chinese = list_article[0]
    english = list_article[1]
    eng_parts = english.split('.')
    ch_parts = chinese.split('。')
    unsplit = f"{english}+++{chinese}"

    # Equal lengths make one "<= 2" check sufficient for both lists.
    if len(eng_parts) != len(ch_parts) or len(eng_parts) <= 2:
        return unsplit

    # The fragment after the last terminator is discarded below. If it holds
    # real text (paragraph not ending in "." / "。", e.g. a closing quote),
    # splitting would silently drop it, so keep the paragraphs whole.
    if eng_parts[-1].strip() or ch_parts[-1].strip():
        return unsplit

    pairs = [
        f"{eng}.+++{ch}。"
        for eng, ch in zip(eng_parts[:-1], ch_parts[:-1])
    ]
    return "+++".join(pairs)

6、将内容写入文件

def generate_document_with_text(content, doc_path='E:\\Now_Works\\reading_two\\'):
    """Append *content* to today's text file, creating its folder if needed.

    The file is ``<doc_path>/<M.D>新闻/<M.D>每日一词.txt`` where ``M.D`` is the
    current month and day (no zero padding). Existing content is kept; the
    new text is appended.

    Args:
        content: Text to write.
        doc_path: Base output folder. Defaults to the original hard-coded
            location, so existing callers are unaffected.
    """
    today = datetime.datetime.now()
    date_name = f"{today.month}.{today.day}"
    # os.path.join gives the same paths as the old '\\' concatenation on
    # Windows and also works on other platforms.
    dir_path = os.path.join(doc_path, date_name + '新闻')
    text_path = os.path.join(dir_path, date_name + '每日一词' + '.txt')
    # makedirs also creates missing parent folders and does not fail when the
    # folder already exists (no exists()/mkdir() race).
    os.makedirs(dir_path, exist_ok=True)
    with open(text_path, mode='a', encoding='utf8') as article_file:
        article_file.write(content)