用python抓取韩寒的新浪博客的代码:
#<a title="地震思考录" target="_blank" href="http://blog.sina.com.cn/s/blog_4701280b0102egl0.html">地震思考录</a>
#coding:utf-8
import os
import time
import urllib
# Article index page of the blog to scrape.
ARTICLE_LIST_URL = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html'
# Upper bound on how many article links are collected from the index page.
MAX_ARTICLES = 6
# Directory (relative to the working directory) that receives the pages.
SAVE_DIR = 'hanhan'
# Pause between two consecutive downloads, in seconds.
DELAY_SECONDS = 15


def extract_article_urls(page, limit=MAX_ARTICLES):
    """Return up to *limit* article URLs found in the HTML text *page*.

    A link is recognised by the marker sequence ``<a title=`` ... ``href=``
    ... ``.html``; the URL is the text between the quote that follows
    ``href=`` and the end of ``.html``. Scanning stops as soon as any of the
    three markers can no longer be found, so the result may be shorter than
    *limit* (and is empty for a page without matching links).
    """
    urls = []
    position = 0
    while len(urls) < limit:
        title_at = page.find('<a title=', position)
        if title_at == -1:
            break
        href_at = page.find('href=', title_at)
        if href_at == -1:
            break
        suffix_at = page.find('.html', href_at)
        if suffix_at == -1:
            break
        # Skip over 'href="' (attribute name plus opening quote) and keep
        # everything up to and including the '.html' suffix.
        urls.append(page[href_at + len('href="'):suffix_at + len('.html')])
        # Resume the search at the end of the link that was just consumed.
        position = suffix_at
    return urls


def local_filename(article_url):
    """Return the last path component of *article_url* (text after the final '/')."""
    return article_url.rsplit('/', 1)[-1]


def download_articles(urls, save_dir=SAVE_DIR, delay=DELAY_SECONDS):
    """Fetch every URL in *urls* and store the raw response under *save_dir*.

    *save_dir* is created when it does not exist yet. Only the URLs actually
    passed in are fetched, and the process sleeps *delay* seconds between two
    downloads (not after the last one).
    """
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    for index, article_url in enumerate(urls):
        if index:
            # Throttle requests; no pause is needed before the first one.
            time.sleep(delay)
        print('downloading %s' % article_url)
        response = urllib.urlopen(article_url)
        try:
            content = response.read()
        finally:
            response.close()
        # Binary mode keeps the downloaded bytes untouched on every platform.
        with open(os.path.join(save_dir, local_filename(article_url)), 'wb') as out:
            out.write(content)
    print('download finished')


def main():
    """Scrape the article index page and download the linked articles."""
    response = urllib.urlopen(ARTICLE_LIST_URL)
    try:
        listing = response.read()
    finally:
        response.close()
    urls = extract_article_urls(listing)
    for article_url in urls:
        print(article_url)
    print('end of find!')
    download_articles(urls)


if __name__ == '__main__':
    main()