爬取人民日报评论版面文章

最新推荐文章于 2025-02-17 11:02:46 发布

liu_96

最新推荐文章于 2025-02-17 11:02:46 发布

阅读量856

点赞数 5

文章标签：爬虫 python 开发语言

本文链接：https://blog.csdn.net/liu_96/article/details/121727260

版权

因为在准备公务员考试，想着把人民日报评论版面看看，奈何这个app不好用，遂有了自己爬虫的念想，在csdn上搜索了下，这里记录下，感谢CSDN博主「机灵鹤」的原创文章。
————————————————
版权声明：本文为CSDN博主「机灵鹤」的原创文章，遵循CC 4.0 BY-SA版权协议，转载请附上原文出处链接及本声明。
原文链接：https://blog.csdn.net/wenxuhonghe/article/details/90047081

本文在原博主代码的基础上略作修改：只爬取评论版面的文章，并且爬取的时间范围固定为从当前日期往前推 30 天（不含当天）。这里把代码上传，供大家参考。

#!/usr/bin/env python
# coding: utf-8
import requests
import bs4
import os
import datetime
import time

def fetchUrl(url, timeout=10):
    '''
    Download a web page and return its HTML as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The response body decoded with the encoding detected from its content.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: The server did not answer within ``timeout`` seconds.
    '''
    # Browser-like headers, sent with every request.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }

    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    # Decode using the encoding guessed from the body rather than the one
    # declared in the response headers.
    r.encoding = r.apparent_encoding
    return r.text

def getNewsList(year, month, day):
    '''
    Collect the article links on the commentary ("评论") page of one issue.

    Args:
        year: Four-digit year as a string, e.g. '2021'.
        month: Two-digit month as a string, e.g. '12'.
        day: Two-digit day as a string, e.g. '04'.

    Returns:
        A list of absolute article URLs found on the commentary page, or
        None when the issue has no commentary page or the page does not have
        the expected structure.

    Raises:
        Whatever fetchUrl raises when a page cannot be downloaded.
    '''
    baseUrl = 'http://paper.people.com.cn/rmrb/html/' + year + '-' + month + '/' + day + '/'

    # Page 01 is fetched first; its navigation block lists the issue's pages.
    bsobj = bs4.BeautifulSoup(fetchUrl(baseUrl + 'nbs.D110000renmrb_01.htm'), 'html.parser')
    pageList = bsobj.find('div', attrs={'class': 'swiper-container'})
    if pageList is None:
        # Navigation block missing: treat it like "no commentary page".
        return None

    for pageName in pageList.find_all('a'):
        # Assumes link text shaped like '05版：评论' (two-digit page number,
        # two separator characters, then the page name) - TODO confirm.
        if pageName.text[4:] == '评论':
            pageUrl = baseUrl + 'nbs.D110000renmrb_' + pageName.text[:2] + '.htm'
            bsobj = bs4.BeautifulSoup(fetchUrl(pageUrl), 'html.parser')

    # If no link matched, bsobj is still page 01 and this check rejects it.
    newsWant = bsobj.find('div', attrs={'class': 'paper-bot'})
    if newsWant is None or newsWant.text[6:8] != '评论':
        return None

    newsBox = bsobj.find('div', attrs={'class': 'news'})
    if newsBox is None or newsBox.ul is None:
        return None

    linkList = []
    for item in newsBox.ul.find_all('li'):
        for anchor in item.find_all('a'):
            # Anchors without an href are skipped instead of raising KeyError.
            link = anchor.get('href', '')
            if 'nw.D110000renmrb' in link:
                linkList.append(baseUrl + link)

    return linkList

def getContent(html):
    '''
    Extract the headline block and body text from an article page.

    Args:
        html: HTML source of one article page.

    Returns:
        A string made of the text of the first <h3>, <h1> and <h2> tags (one
        per line, in that order) followed by the text of every <p> inside the
        element with id "ozoom", one paragraph per line. A heading or the
        body container that is absent contributes an empty string instead of
        raising.
    '''
    bsobj = bs4.BeautifulSoup(html, 'html.parser')

    # Headings: a missing tag yields an empty line so the layout stays stable.
    headings = []
    for tag in (bsobj.h3, bsobj.h1, bsobj.h2):
        headings.append(tag.text if tag is not None else '')
    title = ''.join(heading + '\n' for heading in headings)

    # Body: every paragraph inside the "ozoom" container.
    body = bsobj.find('div', attrs={'id': 'ozoom'})
    paragraphs = body.find_all('p') if body is not None else []
    content = ''.join(p.text + '\n' for p in paragraphs)

    return title + content
    
def saveFile(content, path, filename):
    '''
    Write text to ``filename`` inside the directory ``path`` as UTF-8.

    Args:
        content: Text to write.
        path: Target directory, with or without a trailing separator. It is
            created (including parents) when it does not exist. An empty
            string means the current working directory.
        filename: Name of the file to create or overwrite inside ``path``.
    '''
    # exist_ok avoids the check-then-create race of a separate exists() test.
    if path:
        os.makedirs(path, exist_ok=True)

    # os.path.join inserts a separator only when path lacks one, so the file
    # always lands inside the directory instead of next to it.
    with open(os.path.join(path, filename), 'w', encoding='utf-8') as f:
        f.write(content)

def download_rmrb(year, month, day, destdir):
    '''
    Download one day's commentary-page articles into a single text file.

    The articles are concatenated, separated by blank lines, and written to
    ``<destdir>/news/<year><month><day>-<page>.txt``, where ``<page>`` is the
    page number parsed from the last article link. When getNewsList returns
    nothing, a notice is printed and no file is written.

    Args:
        year: Four-digit year as a string.
        month: Two-digit month as a string.
        day: Two-digit day as a string.
        destdir: Root directory; files go into its ``news`` subdirectory.
    '''
    titleList = getNewsList(year, month, day)
    if not titleList:
        print('当天无评论栏')
        return

    # Fetch and parse each article in page order.
    articles = [getContent(fetchUrl(url)) for url in titleList]

    # Assumes links end in '..._<date>_<articleNo>-<pageNo>.htm' - TODO
    # confirm. Only the page number is needed, for the file name.
    pageNo = titleList[-1].split('_')[2].split('.')[0].split('-')[1]

    # The leading space is kept so the output matches earlier runs exactly.
    huizongContent = ' ' + ''.join(article + '\n\n' for article in articles)

    path = destdir + '/news/'
    fileName = year + month + day + '-' + pageNo + '.txt'
    saveFile(huizongContent, path, fileName)




if __name__ == '__main__':
    # Script entry point: download the commentary page for each of the
    # previous `daylen` days, starting with yesterday and walking backwards.
    destdir = "D:/data"
    daylen = 30
    today = datetime.date.today()

    for offset in range(1, daylen + 1):
        target = today - datetime.timedelta(days=offset)
        year = target.strftime("%Y")
        month = target.strftime("%m")
        day = target.strftime("%d")

        download_rmrb(year, month, day, destdir)
        print("爬取完成：" + year + month + day)

如果对大家有帮助，欢迎点赞。