爬虫爬取人民网新闻

水0
已于 2022-01-25 00:43:56 修改
阅读量1.9k
点赞数 3
文章标签：爬虫 python
于 2022-01-25 00:42:55 首次发布
本文链接：https://blog.csdn.net/m0_62609328/article/details/122677868
版权
import requests#网页请求
import bs4#网页解析
import re#正则表达式
import os#cmd命令
import time

def strcmp(str1, str2):
    """Loosely compare a user-typed title with a scraped title.

    Spaces are removed from both strings, and the two are considered a
    match when either one is a prefix of the other. This lets the user type
    only the beginning of a headline.

    Args:
        str1: Text typed by the user.
        str2: Title scraped from the page; may be None or empty, in which
            case nothing matches.

    Returns:
        1 when the titles match, otherwise 0.
    """
    if not str1 or not str2:
        return 0
    typed = str1.replace(' ', '')
    scraped = str2.replace(' ', '')
    # An empty string is a prefix of everything, so an empty (or
    # spaces-only) side must never count as a match.
    if not typed or not scraped:
        return 0
    if typed.startswith(scraped) or scraped.startswith(typed):
        return 1
    return 0

def print_article(soup):
    """Print the body text of a parsed news article page.

    Two page layouts are tried in order: a ``div`` with class
    ``rm_txt_con cf`` whose direct children are printed, then ``p``
    elements styled ``text-indent: 2em;``.

    Args:
        soup: Parsed article page (a BeautifulSoup object).

    Returns:
        1 if one of the known layouts was found, otherwise 0.
    """
    container = soup.find('div', class_='rm_txt_con cf')
    if container:
        for each in container:
            if each and each.string is not None:
                print(each.string)
        return 1

    paragraphs = soup.find_all('p', style='text-indent: 2em;')
    if not paragraphs:
        return 0
    for each in paragraphs:
        if not each:
            continue
        if each.string is not None:
            print(each.string)
        # A paragraph whose text is wrapped in a <span> alongside other
        # nodes has no .string of its own; fall back to the span's text.
        elif each.span and each.span.string is not None:
            print(each.span.string)
    return 1

def save_news(soup, newsname):  # bookmark a news article (save it to local disk)
    """Ask the user whether to bookmark an article and, if so, save its text.

    The article text is extracted with the same two layouts used for
    display: the ``rm_txt_con cf`` div first, then ``p`` elements styled
    ``text-indent: 2em;``. The file is only created once some article
    layout has been found, so a failed save leaves no empty file behind.

    Args:
        soup: Parsed article page (a BeautifulSoup object).
        newsname: Article title, used together with today's date as the
            file name. Characters that cannot appear in a file name are
            removed; None is treated as an empty title.

    Returns:
        1 when the article was saved, 0 when the user declined or no
        article text could be located.
    """
    save = input("是否收藏该新闻？\n收藏请输入0以外任何字符\n不收藏请输入0")
    if save == '0':
        return 0

    container = soup.find('div', class_='rm_txt_con cf')
    if container:
        pieces = [each.string for each in container
                  if each and each.string is not None]
    else:
        paragraphs = soup.find_all('p', style='text-indent: 2em;')
        if not paragraphs:
            print("收藏失败！")
            return 0
        pieces = []
        for each in paragraphs:
            if not each:
                continue
            if each.string is not None:
                pieces.append(each.string)
            # Text wrapped in a <span> is not exposed through the
            # paragraph's own .string, so read it from the span.
            elif each.span and each.span.string is not None:
                pieces.append(each.span.string)

    # Drop the book-title marks plus every character that is illegal in a
    # Windows file name (or would be read as a path separator).
    forbidden = str.maketrans('', '', '《》"\\/:*?<>|\r\n\t')
    safe_name = str(newsname or '').translate(forbidden)
    path = 'E:/py爬虫/news/' + time.strftime("%Y%m%d") + safe_name + '.txt'
    with open(path, 'w', encoding='utf-8') as file:
        file.writelines(pieces)
    # Report success only after the text has actually been written.
    print('收藏成功')
    return 1



def open_url(url, timeout=10):
    """Fetch *url* with a browser-like User-Agent and return the response.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``requests`` response, with its text encoding forced to GBK.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}
    res = requests.get(url, headers=headers, timeout=timeout)
    # NOTE(review): every page is decoded as GBK; confirm that all target
    # pages really use that encoding.
    res.encoding = 'gbk'
    return res

def get_data(res):
    """Write the decoded text of *res* to ``test.txt`` in the working directory.

    The file is overwritten on every call and encoded as UTF-8. Nothing is
    returned.
    """
    with open("test.txt", mode="w", encoding="utf-8") as dump:
        dump.write(res.text)

def get_hotnews(soup):
    """List the headline news section and let the user open one article.

    The main headline (``rm_topline``) and the secondary headlines
    (``rm_topread``) inside the ``hotnews`` element are printed. The user
    can then open the main headline, pick a secondary headline by typing
    (the beginning of) its title, or leave. Invalid input shows the list
    again; the prompt loops instead of recursing, so nothing is downloaded
    twice because of a typo.

    Args:
        soup: Parsed home page (a BeautifulSoup object).

    Returns:
        1 after an article has been shown, 0 when the user leaves or the
        headline section cannot be found.
    """
    content = soup.find(id="hotnews")
    topline_target = content.find(id="rm_topline") if content is not None else None  # main headline
    topread_target = content.find(id="rm_topread") if content is not None else None  # secondary headlines
    if topline_target is None or topread_target is None:
        print("未找到要闻栏目，页面结构可能已变化！")
        return 0

    topline_links = topline_target.find_all('a')
    topread_links = topread_target.find_all('a')

    # Work out the main headline's title once. When the headline is an
    # image it has no text, so the title is read from the linked page; that
    # parsed page is kept so it can be reused if the user opens it.
    topline_soup = None
    if topline_target.string is not None:
        newsname = topline_target.string
    elif topline_links:
        res = open_url(topline_links[0].get('href'))
        topline_soup = bs4.BeautifulSoup(res.text, "html.parser")
        title = topline_soup.title
        newsname = title.string if title is not None else None
    else:
        newsname = None

    while True:
        for link in topline_links:
            print("大标题要闻链接", link.get('href'))  # main headline link
        print("大标题要闻:", newsname)  # main headline title
        for link in topread_links:
            print("小标题要闻链接:", link.get('href'))  # secondary headline link
            print("小标题要闻:", link.string)  # secondary headline title

        key = input('输入1访问大标题要闻\n输入2访问小标题要闻\n输入3退出该界面')
        if key == '3':
            return 0
        if key == '1':
            if not topline_links:
                return 0
            if topline_soup is None:
                res = open_url(topline_links[0].get('href'))
                topline_soup = bs4.BeautifulSoup(res.text, "html.parser")
            print_article(topline_soup)
            save_news(topline_soup, newsname)
            return 1
        if key == '2':
            wanted = input("请输入你浏览的小新闻名称！")
            for link in topread_links:
                if strcmp(wanted, link.string):
                    res = open_url(link.get('href'))
                    topread_soup = bs4.BeautifulSoup(res.text, "html.parser")
                    print_article(topread_soup)
                    save_news(topread_soup, link.string)
                    return 1
        # Unknown menu key, or no secondary headline matched the input.
        print("*" * 30)
        print("您输入错误，请重试！")
        print("*" * 30)

def get_imgnews(soup):
    """List the carousel (``swiper-slide``) news and open the one chosen.

    Each slide that has a ``span`` with a titled link is listed by that
    title. The user picks one by typing (the beginning of) its title, or
    ``0`` to leave. Invalid input shows the list again.

    Args:
        soup: Parsed home page (a BeautifulSoup object).

    Returns:
        0 in every case: after an article has been shown and when the user
        leaves.
    """
    # Collect (title, slide) pairs up front instead of attaching ad-hoc
    # attributes to the parsed tags.
    slides = []
    for each in soup.find_all("div", class_="swiper-slide"):
        anchor = each.span.a if each.span else None
        if anchor is None or anchor.string is None:
            continue
        # Titles may contain line breaks coming from the HTML source.
        slides.append((anchor.string.replace('\n', ''), each))

    while True:
        for img_name, _ in slides:
            print(img_name)
        wanted = input("请输入你浏览的新闻名称！\n输入0退出该界面")
        if wanted == '0':
            return 0
        for img_name, each in slides:
            if strcmp(wanted, img_name):
                res = open_url(each.a.get('href'))
                article = bs4.BeautifulSoup(res.text, "lxml")
                print_article(article)
                save_news(article, img_name)
                return 0
        print("*" * 30)
        print("您输入错误，请重试！")
        print("*" * 30)

def get_sidebarnews(soup):
    """List the sidebar (``rm_aq``) news and open the one the user chooses.

    The sidebar has two headings, whose titles are read from the pages
    they link to, followed by a list of plain-text items. The user picks an
    entry by typing (the beginning of) its title, or ``0`` to leave. List
    items are matched before the two headings. The heading pages are
    downloaded once per call, not once per retry.

    Args:
        soup: Parsed home page (a BeautifulSoup object).

    Returns:
        1 after an article has been shown, 0 when the user leaves or the
        sidebar cannot be found.
    """
    content = soup.find(id="rm_aq")
    heading_anchors = []
    if content is not None:
        # First heading (an image title) and the heading with class "A6".
        for heading in (content.h2, content.find('h2', class_="A6")):
            if heading is not None and heading.a is not None:
                heading_anchors.append(heading.a)
    if len(heading_anchors) != 2:
        print("未找到边栏新闻栏目，页面结构可能已变化！")
        return 0

    # The headings carry no usable text themselves, so their titles are
    # taken from the <title> of the linked pages.
    heading_entries = []
    for anchor in heading_anchors:
        href = anchor.get('href')
        page = bs4.BeautifulSoup(open_url(href).text, "html.parser")
        heading_entries.append((page.title.string, href))

    list_items = [each for each in content.find_all('li')
                  if each.string is not None]

    # Candidates in matching order: list items first, then the headings.
    entries = [(each.string, each.a.get('href'))
               for each in list_items if each.a is not None]
    entries.extend(heading_entries)

    while True:
        for title, _ in heading_entries:
            print(title)
        for each in list_items:
            print(each.string)
        wanted = input("请输入你浏览的小新闻名称！\n输入0退出该界面")
        if wanted == '0':
            return 0
        for newsname, _url in entries:
            if strcmp(wanted, newsname):
                res = open_url(_url)
                article = bs4.BeautifulSoup(res.text, "lxml")
                print_article(article)
                save_news(article, newsname)
                return 1
        print("*" * 30)
        print("您输入错误，请重试！")
        print("*" * 30)

def read_savenews():
    """List the bookmarked news files and print the one the user chooses.

    File names (without ``.txt``) found in the bookmark directory are
    listed; the user types a name, ignoring spaces, or ``0`` to leave.
    Invalid input shows the list again. When the directory is missing or
    empty there is nothing to choose from, so the function reports that and
    returns without prompting.

    Returns:
        0 in every case.
    """
    news_dir = 'E:/py爬虫/news/'
    try:
        saved = os.listdir(news_dir)
    except FileNotFoundError:
        # The bookmark directory has not been created yet.
        saved = []
    if not saved:
        print("您暂未收藏任何新闻！")
        return 0

    while True:
        for each in saved:
            print(each.replace('.txt', ''))
        wanted = input("请输入你要阅读的收藏新闻！\n输入0退出该界面")
        if wanted == '0':
            return 0
        for each in saved:
            if wanted.replace(' ', '') == each.replace('.txt', '').replace(' ', ''):
                with open(news_dir + each, 'r', encoding="utf-8") as file:
                    print(file.read())
                return 0
        print("*" * 30)
        print("您输入错误，请重试！")
        print("*" * 30)


def make_file():
    """Make sure the directory that stores bookmarked news exists.

    Missing parent directories are created as well, and calling this when
    the directory already exists is a no-op.
    """
    os.makedirs('E:/py爬虫/news', exist_ok=True)

def main():
    """Run the interactive people.com.cn news reader.

    Downloads and parses the home page once, makes sure the bookmark
    directory exists, dumps the raw page text to disk, then shows the main
    menu until the user chooses to quit.

    Raises:
        SystemExit: With status 0 when the user selects the exit option.
    """
    url = "http://www.people.com.cn/"
    res = open_url(url)  # open_url already sets the response encoding
    make_file()
    get_data(res)
    soup = bs4.BeautifulSoup(res.text, "lxml")
    print("欢迎使用人民网新闻自助阅读工具！\nmade by shui0")

    actions = {
        '1': lambda: get_hotnews(soup),
        '2': lambda: get_imgnews(soup),
        '3': lambda: get_sidebarnews(soup),
        '4': read_savenews,
    }
    while True:
        key = input("1:浏览标题新闻\n2:浏览动图新闻\n3:浏览边栏新闻\n4:浏览已收藏新闻\n5:退出程序\n请输入对应数字以选择对应功能！")
        if key == '5':
            # Raised directly rather than through exit(), which is injected
            # by the site module and is not guaranteed to exist.
            raise SystemExit(0)
        action = actions.get(key)
        if action is None:
            print("请输入正确的数字！")
        else:
            action()

# Start the interactive reader only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
初学爬虫，手写一个爬虫代码，供各位大佬参考借鉴，如有错误，还请指正。
(备注：此版并非最终版，作者将逐步更新此爬虫代码。)