"""Download every article of a Sina blog as local HTML files.

The script walks the paginated article-list pages of one blog, collects the
article links found on each page, then downloads every article and stores the
raw response under ``blog/``.  The code runs unchanged on Python 2 and
Python 3.
"""
import contextlib
import os
import time
import urllib

try:  # Python 3 moved urlopen into urllib.request.
    from urllib.request import urlopen
except ImportError:  # Python 2
    urlopen = urllib.urlopen

# Article-list pages are numbered 1..LAST_PAGE; %d receives the page number.
LIST_URL_TEMPLATE = 'http://blog.sina.com.cn/s/articlelist_1193111400_0_%d.html'
LAST_PAGE = 11
# Upper bound on the number of links taken from a single list page.
MAX_LINKS_PER_PAGE = 40
# Directory (relative to the working directory) that receives the articles.
OUTPUT_DIR = 'blog'
# Pause after each article download so the server is not hammered.
DOWNLOAD_DELAY_SECONDS = 5
# Number of trailing URL characters used as the local file name.
FILENAME_LENGTH = 26


def fetch(url):
    """Return the raw response body of *url*, closing the connection after."""
    with contextlib.closing(urlopen(url)) as response:
        return response.read()


def extract_article_urls(html, max_links=MAX_LINKS_PER_PAGE):
    """Return the article URLs found in the text of one list page.

    A link is recognised as an ``<a title=`` tag followed by ``href=`` and
    then ``.html``; the URL is the text between the quote that follows
    ``href=`` and the end of ``.html``.  Scanning resumes at the ``.html`` of
    the previous match and stops at the first incomplete match or once
    *max_links* URLs have been collected.
    """
    urls = []
    position = 0
    while len(urls) < max_links:
        title = html.find('<a title=', position)
        if title == -1:
            break
        href = html.find('href=', title)
        if href == -1:
            break
        end = html.find('.html', href)
        if end == -1:
            break
        # Skip over 'href="' and keep the '.html' suffix.
        urls.append(html[href + len('href="'):end + len('.html')])
        position = end
    return urls


def article_filename(url):
    """Return the local file name for *url*.

    The name is the last ``FILENAME_LENGTH`` characters of the URL; anything
    up to a remaining ``/`` is dropped so the result never contains a path
    separator when the URL's final segment is shorter than that.
    """
    return url[-FILENAME_LENGTH:].rsplit('/', 1)[-1]


def collect_article_urls():
    """Read every list page and return all article URLs, in page order."""
    urls = []
    for page in range(1, LAST_PAGE + 1):
        page_html = fetch(LIST_URL_TEMPLATE % page)
        if not isinstance(page_html, str):
            # Python 3 returns bytes; the markers and URLs searched for are
            # ASCII, so undecodable bytes elsewhere can safely be replaced.
            page_html = page_html.decode('utf-8', 'replace')
        for url in extract_article_urls(page_html):
            urls.append(url)
            # Report the URL that was just found on this page.
            print('%d %s' % (page, url))
        print('%d find end' % page)
    print('all find end !')
    return urls


def download_articles(urls):
    """Download each URL in *urls* and save it under ``OUTPUT_DIR``."""
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    for url in urls:
        content = fetch(url)
        target = os.path.join(OUTPUT_DIR, article_filename(url))
        # Binary mode stores the response bytes exactly as received.
        with open(target, 'wb') as output:
            output.write(content)
        time.sleep(DOWNLOAD_DELAY_SECONDS)


def main():
    """Collect all article URLs, report how many, then download them."""
    urls = collect_article_urls()
    print('url sum: %d' % len(urls))
    download_articles(urls)


if __name__ == '__main__':
    main()
以上代码依次抓取博客文章列表的 11 个分页,提取其中每篇文章的链接,然后逐篇下载文章内容,并以 HTML 文件的形式保存到本地的 blog/ 目录中。