A First Look at Python Web Scraping (Crawling CSDN Articles)

Fetching a page with the requests library:

import requests

# fetch the CSDN home page and print the raw HTML
target = 'https://www.csdn.net/'
req = requests.get(target)
print(req.text)

Output:

</head>
<body data-category="home" data-host_type="www">
    <script id="toolbar-tpl-scriptId" prod="download" skin="black" src="//csdnimg.cn/public/common/toolbar/js/content_toolbar.js" type="text/javascript" domain="http://blog.csdn.net"></script>
    <div class="container clearfix">
      <nav id="nav" class="clearfix">
        <div class="clearfix">
        <div class="nav_com">
          <ul>
                  <li class="active"><a href="/">推荐</a></li>
                      <li class=""><a href="/nav/watchers">关注</a></li>
                      <li class=""><a href="/nav/career">程序人生</a></li>
……
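Note that req.text is decoded with whatever encoding requests guesses from the response headers, so Chinese pages can occasionally come back garbled. A minimal sketch of checking the response before trusting the text, using only standard requests attributes (status_code, encoding, apparent_encoding):

import requests

target = 'https://www.csdn.net/'
req = requests.get(target, timeout=5)

# make sure the request actually succeeded before parsing anything
print(req.status_code)  # e.g. 200

# requests guesses the encoding from the headers; fall back to the
# encoding detected from the body if the guess looks unreliable
if req.encoding is None or req.encoding.lower() == 'iso-8859-1':
    req.encoding = req.apparent_encoding
print(req.text[:200])  # first part of the decoded HTML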

Crawling CSDN blog HTML files with the scraper

import requests
import re
import time
import numpy as np


# Fetch the HTML content of the given URL, retrying until the request succeeds
def getHtml(url):
    while True:
        try:
            res = requests.get(url, timeout=2, headers={'User-Agent': 'Baiduspider'})
            break
        except requests.RequestException:
            time.sleep(1)
    encode = res.encoding
    s = res.content.decode(encode)
    return s


# Extract the individual blog post links from a CSDN article-list page
def getURL(list_html):
    begin = """<h4 class=\"\">
        <a href=\""""
    end = """\" target=\"_blank\">"""
    r = r'(?<=' + begin + ').*(?=' + end + ')'

    res = re.findall(r, list_html)
    return res


# Save the HTML to a file; path is the local save location
def saveFile(file, path):
    fout = open(path, 'w', encoding='UTF-8')
    fout.write(file)
    fout.close()


# Read a previously saved file back in
def loadFile(path):
    fread = open(path, 'r', encoding='utf-8')
    file = fread.read()
    fread.close()
    return file


# Extract the article title from an HTML document
def getTitle(html):
    return re.search(r'(?<=<title>).*(?=_)', html)[0]


# all blog post links
blog_urls = []


# Collect the blog post links from every list page and save them to disk
def Init():
    # number of list pages on the blog
    page = 36
    for index in range(1, page + 1, 1):
        list_url = 'https://jkchen.blog.csdn.net/article/list/' + str(index)
        list_html = getHtml(list_url)
        blog_url_ar = getURL(list_html)
        for url in blog_url_ar:
            blog_urls.append(url)
    np.save('blog_url.npy', blog_urls)


if __name__ == '__main__':
    # whether to rebuild the link index
    refresh = False
    if refresh:
        Init()

    # whether to save the raw HTML files (the folder must be created beforehand)
    toSave = False
    saveUrl = 'HTMLs/'

    blog_urls = np.load('blog_url.npy')

    epoch = 100
    for T in range(epoch):
        np.random.shuffle(blog_urls)
        index = 0
        for url in blog_urls:
            index += 1
            while True:
                try:
                    html = getHtml(url)
                    break
                except Exception:
                    print("Banned, retrying.")
                    time.sleep(4)
            title = getTitle(html)

            if toSave:
                saveFile(html, saveUrl + title + '.html')
            print('epoch: {}, index: {}, title: {}'.format(T + 1, index, title))
            time.sleep(10 * np.random.rand())
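As a quick sanity check, the two regex helpers can be tried on a hand-written snippet before crawling. The markup below is only illustrative: it assumes the <h4 class=""> / <a href=... target="_blank"> structure that getURL expects and a "title_author" pattern for getTitle, not actual CSDN output.

# sanity check of getURL and getTitle on illustrative markup
sample_list_html = '''<h4 class="">
        <a href="https://jkchen.blog.csdn.net/article/details/123" target="_blank">Some post</a></h4>'''
sample_article_html = '<title>Some post_jkchen的博客-CSDN博客</title>'

print(getURL(sample_list_html))     # ['https://jkchen.blog.csdn.net/article/details/123']
print(getTitle(sample_article_html))  # Some post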
