A First Look at Python Web Scraping (Crawling CSDN Articles)

Fetching a page with the requests library:

import requests

# fetch the CSDN home page and print the raw HTML
target = 'https://www.csdn.net/'
req = requests.get(target)
print(req.text)

Output:

</head>
<body data-category="home" data-host_type="www">
    <script id="toolbar-tpl-scriptId" prod="download" skin="black" src="//csdnimg.cn/public/common/toolbar/js/content_toolbar.js" type="text/javascript" domain="http://blog.csdn.net"></script>
    <div class="container clearfix">
      <nav id="nav" class="clearfix">
        <div class="clearfix">
        <div class="nav_com">
          <ul>
                  <li class="active"><a href="/">推荐</a></li>
                      <li class=""><a href="/nav/watchers">关注</a></li>
                      <li class=""><a href="/nav/career">程序人生</a></li>
……
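Note that req.text is decoded with whatever encoding requests guesses from the response headers, so Chinese pages can occasionally come back garbled. A minimal sketch of checking the response before trusting the text, using only standard requests attributes (status_code, encoding, apparent_encoding):

import requests

target = 'https://www.csdn.net/'
req = requests.get(target, timeout=5)

# make sure the request actually succeeded before parsing anything
print(req.status_code)  # e.g. 200

# requests guesses the encoding from the headers; fall back to the
# encoding detected from the body if the guess looks unreliable
if req.encoding is None or req.encoding.lower() == 'iso-8859-1':
    req.encoding = req.apparent_encoding
print(req.text[:200])  # first part of the decoded HTML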

Crawling CSDN blog HTML files with the scraper

import requests
import re
import time
import numpy as np


# Fetch the HTML content of the given URL, retrying until the request succeeds
def getHtml(url):
    while True:
        try:
            res = requests.get(url, timeout=2, headers={'User-Agent': 'Baiduspider'})
            break
        except requests.RequestException:
            time.sleep(1)
    encode = res.encoding
    s = res.content.decode(encode)
    return s


# Extract the individual blog post links from a CSDN article-list page
def getURL(list_html):
    begin = """<h4 class=\"\">
        <a href=\""""
    end = """\" target=\"_blank\">"""
    r = r'(?<=' + begin + ').*(?=' + end + ')'

    res = re.findall(r, list_html)
    return res


# Save the HTML to a file; path is the local save location
def saveFile(file, path):
    fout = open(path, 'w', encoding='UTF-8')
    fout.write(file)
    fout.close()


# Read a previously saved file back in
def loadFile(path):
    fread = open(path, 'r', encoding='utf-8')
    file = fread.read()
    fread.close()
    return file


# Extract the article title from an HTML document
def getTitle(html):
    return re.search(r'(?<=<title>).*(?=_)', html)[0]


# all blog post links
blog_urls = []


# Collect the blog post links from every list page and save them to disk
def Init():
    # number of list pages on the blog
    page = 36
    for index in range(1, page + 1, 1):
        list_url = 'https://jkchen.blog.csdn.net/article/list/' + str(index)
        list_html = getHtml(list_url)
        blog_url_ar = getURL(list_html)
        for url in blog_url_ar:
            blog_urls.append(url)
    np.save('blog_url.npy', blog_urls)


if __name__ == '__main__':
    # whether to rebuild the link index
    refresh = False
    if refresh:
        Init()

    # whether to save the raw HTML files (the folder must be created beforehand)
    toSave = False
    saveUrl = 'HTMLs/'

    blog_urls = np.load('blog_url.npy')

    epoch = 100
    for T in range(epoch):
        np.random.shuffle(blog_urls)
        index = 0
        for url in blog_urls:
            index += 1
            while True:
                try:
                    html = getHtml(url)
                    break
                except Exception:
                    print("Banned, retrying.")
                    time.sleep(4)
            title = getTitle(html)

            if toSave:
                saveFile(html, saveUrl + title + '.html')
            print('epoch: {}, index: {}, title: {}'.format(T + 1, index, title))
            time.sleep(10 * np.random.rand())
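As a quick sanity check, the two regex helpers can be tried on a hand-written snippet before crawling. The markup below is only illustrative: it assumes the <h4 class=""> / <a href=... target="_blank"> structure that getURL expects and a "title_author" pattern for getTitle, not actual CSDN output.

# sanity check of getURL and getTitle on illustrative markup
sample_list_html = '''<h4 class="">
        <a href="https://jkchen.blog.csdn.net/article/details/123" target="_blank">Some post</a></h4>'''
sample_article_html = '<title>Some post_jkchen的博客-CSDN博客</title>'

print(getURL(sample_list_html))     # ['https://jkchen.blog.csdn.net/article/details/123']
print(getTitle(sample_article_html))  # Some post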
