使用requests库进行网页请求,获取页面HTML内容:
# Fetch the CSDN homepage and print its raw HTML.
import requests

url = 'https://www.csdn.net/'
response = requests.get(url)
print(response.text)
输出:
</head>
<body data-category="home" data-host_type="www">
<script id="toolbar-tpl-scriptId" prod="download" skin="black" src="//csdnimg.cn/public/common/toolbar/js/content_toolbar.js" type="text/javascript" domain="http://blog.csdn.net"></script>
<div class="container clearfix">
<nav id="nav" class="clearfix">
<div class="clearfix">
<div class="nav_com">
<ul>
<li class="active"><a href="/">推荐</a></li>
<li class=""><a href="/nav/watchers">关注</a></li>
<li class=""><a href="/nav/career">程序人生</a></li>
……
使用爬虫爬取csdn博客html文件
import requests
import re
import time
import numpy as np
# Fetch the HTML content of the given URL.
def getHtml(url):
    """Fetch *url* and return its decoded HTML as a str.

    Retries forever on network errors, sleeping 1 s between attempts.
    Sends a 'Baiduspider' User-Agent because CSDN serves search-engine
    crawlers without interference.
    """
    while True:
        try:
            res = requests.get(url, timeout=2,
                               headers={'User-Agent': 'Baiduspider'})
            break
        except requests.RequestException:
            # Narrowed from bare 'except:' so Ctrl-C still works here.
            time.sleep(1)
    # Bug fix: the original called s.decode(encode) but discarded the
    # result and returned raw bytes; callers (re.findall with a str
    # pattern, saveFile in text mode) require str.
    return res.content.decode(res.encoding or 'utf-8')
# Extract each blog-post link from a CSDN article-list page.
def getURL(list_html):
    """Return all blog-post URLs found in *list_html*.

    Matches the href of anchors shaped like
        <h4 class=""> ... <a href="URL" target="_blank">
    A capture group with a non-greedy match replaces the original
    fixed-width lookbehind, so the pattern tolerates whitespace
    variations between the tags and cannot overmatch when several
    anchors share a line.
    """
    pattern = r'<h4 class="">\s*<a href="(.*?)" target="_blank">'
    return re.findall(pattern, list_html)
# Save html to disk; *url* is the local destination path.
def saveFile(file, url):
    """Write the string *file* to local path *url*, UTF-8 encoded.

    Uses a with-statement so the handle is closed even if the write
    raises (the original leaked the handle on error).
    """
    with open(url, 'w', encoding='UTF-8') as fout:
        fout.write(file)
def loadFile(url):
    """Read and return the UTF-8 text stored at local path *url*.

    Uses a with-statement so the handle is closed even if the read
    raises (the original leaked the handle on error).
    """
    with open(url, 'r', encoding='utf-8') as fread:
        return fread.read()
# Extract the post title from an HTML document's <title> tag.
def getTitle(html):
    """Return the text between '<title>' and the last '_' on that line.

    CSDN page titles look like '<title>Post Name_author-CSDN</title>';
    the part before the trailing underscore is the post title.

    Raises:
        ValueError: if no matching title is found (the original crashed
        with an opaque ``TypeError: 'NoneType' object is not
        subscriptable`` in that case).
    """
    m = re.search(r'(?<=<title>).*(?=_)', html)
    if m is None:
        raise ValueError('no <title>..._ pattern found in html')
    return m[0]
# All discovered blog-post URLs (module-level list, filled by Init()
# and consumed by the __main__ crawl loop below).
blog_urls = []
# Collect blog-post links from the article-list pages.
def Init(page=36):
    """Crawl the blog's article-list pages and cache every post URL.

    Walks list pages 1..page, extracts each post's URL into the
    module-level ``blog_urls``, and persists the result to
    'blog_url.npy'.

    Args:
        page: number of list pages to fetch. Defaults to the original
            hard-coded 36, so existing callers are unaffected.
    """
    for index in range(1, page + 1):
        list_url = 'https://jkchen.blog.csdn.net/article/list/' + str(index)
        list_html = getHtml(list_url)
        # extend() replaces the original element-by-element append loop
        blog_urls.extend(getURL(list_html))
    np.save('blog_url.npy', blog_urls)
if __name__ == '__main__':
    # Set True to re-crawl the article-list pages and rebuild the URL cache.
    refresh = False
    if refresh:
        Init()
    # Set True to save each fetched page under saveUrl
    # (the directory must already exist).
    toSave = False
    saveUrl = 'HTMLs/'
    blog_urls = np.load('blog_url.npy')
    epoch = 100
    for T in range(epoch):
        # Shuffle each epoch so the visit order looks organic.
        np.random.shuffle(blog_urls)
        for index, url in enumerate(blog_urls, start=1):
            while True:
                try:
                    html = getHtml(url)
                    break
                except Exception:
                    # Narrowed from bare 'except:': the original also
                    # swallowed KeyboardInterrupt, so Ctrl-C could not
                    # stop the crawl loop.
                    print("Banned, and retry. ")
                    time.sleep(4)
            title = getTitle(html)
            if toSave:
                saveFile(html, saveUrl + title + '.html')
            print('epoch: {}, index: {}, title: {}'.format(T + 1, index, title))
            # Random pause (0-10 s) between requests to avoid a ban.
            time.sleep(10 * np.random.rand())