完整代码如下:
# -*- coding: utf-8 -*-
#@author: Shinalone
import urllib
import time
url = ['']*350 #实际只有316篇
page = 1
link = 1
while page <= 7:
con = urllib.request.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_'+str(page)+'.html').read().decode('utf-8') #注意python3解码
i = 0
title = con.find(r'<a title=')
href = con.find(r'href=',title) #title为开头,缩小搜索范围
html = con.find(r'.html',href)
while title != -1 and href != -1 and html !=-1 and i < 350:
url[i] = con[href +6:html +5]
print(link,' ',url[i])
title = con.find(r'<a title=',html) #html为循环的点睛之笔
href = con.find(r'href=',title)
html = con.find(r'.html',href)
i = i + 1
link = link + 1
else:
print(page,'find end!')
page = page + 1
else:
print('all find end!')
#下载网页
j=0
while j < 350:
if url[j] == '':
continue
content = urllib.request.urlopen(url[j]).read().decode('utf-8')
open(r'hanhan/'+url[j][-26:],'w+',encoding='utf-8').write(content) #'w+'中'+'为“若无,就会创建新的”
print('downloading',url[j])
j = j + 1
time.sleep(4)
else:
print('download finished')

本文介绍了一个用于爬取新浪博客文章的Python脚本实现。该脚本能够从指定的博主页面抓取文章链接并下载对应的文章内容。通过循环遍历页面并解析HTML来定位每篇文章的链接,然后逐篇下载。

被折叠的评论
为什么被折叠?



