Scraping blog posts and saving them locally (failed)

Automating this with Selenium would probably work better… (a rough sketch of that idea follows below)
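As a minimal sketch only, here is what a Selenium-based fetch could look like. It assumes Chrome plus a matching chromedriver are installed and on PATH, and the function name fetch_rendered_html is my own, not part of the original script:

from selenium import webdriver

def fetch_rendered_html(url):
    # Launch a headless Chrome instance (assumes chromedriver is on PATH)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # page_source holds the HTML after JavaScript has executed
        return driver.page_source
    finally:
        driver.quit()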

import requests
from bs4 import BeautifulSoup
import re

header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
}
article_info = {}


def get_html(url):
    r = requests.get(url=url, headers=header)
    if r.status_code == 200:
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'html.parser')
        return soup
    else:
        print("Failed to fetch the page")


def get_article_url(soup):
    # get_html returns None on failure, so bail out instead of retrying forever
    if soup is None:
        return
    try:
        article_list_div = soup.find('div', attrs={'class': 'article-list'})
        items = article_list_div.find_all('a', attrs={'target': '_blank'})
        for item in items:
            article_url = item['href']
            # The link text mixes the title with the 原创/转载 badge; strip the badge and trim whitespace
            article_name = item.get_text().replace('原创', '').replace('转载', '').strip()
            article_info[article_name] = article_url
    except Exception as e:
        print(e)


def save_article():
    for name, url in article_info.items():
        soup = get_html(url)
        if soup is None:
            continue
        # Remove script and style tags so the saved page is plain, static HTML
        for script in soup(["script", "style"]):
            script.extract()
        html = str(soup).replace("csdn", "")
        try:
            with open('D:/python_save/web/' + name + '.html', 'w', encoding='utf-8') as f:
                f.write(html)
        except Exception as e:
            print(name + '.html' + " failed to save")
            print(e)

    # Earlier attempt: save the raw response text without stripping scripts/styles
    # for name, url in article_info.items():
    #     r = requests.get(url=url, headers=header)
    #     if r.status_code == 200:
    #         r.encoding = 'utf-8'
    #         html = r.text
    #     try:
    #         with open('D:/python_save/web/' + name + '.html', 'w', encoding='utf-8') as f:
    #             f.write(html)
    #     except Exception as e:
    #         print(name + '.html' + " failed to save")
    #         print(e)


if __name__ == '__main__':
    for i in range(1, 3):
        url = 'https://blog.csdn.net/qq_43751489/article/list/' + str(i)
        get_article_url(get_html(url))
    save_article()
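The post does not say exactly why saving failed, but one common cause on Windows is that article titles contain characters such as ?, :, or / that are not allowed in file names, so open() raises an error. A hedged sketch of a fix; the helper name safe_filename is hypothetical and not part of the original script:

import re

def safe_filename(name):
    # Replace characters that Windows forbids in file names with an underscore
    return re.sub(r'[\\/:*?"<>|]', '_', name)

# Hypothetical usage inside save_article:
# with open('D:/python_save/web/' + safe_filename(name) + '.html', 'w', encoding='utf-8') as f:
#     f.write(html)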
