# -*- coding: utf-8 -*-
"""
@author: vincent
"""
#/usr/bin/python
#http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html
import urllib
import time
# Article-list page template; the script walks pages 1..7 of this listing.
LIST_PAGE_URL = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_%d.html'
# Directory the downloaded article pages are written into.
SAVE_DIR = r'D:\sublime codes\python\web_crawler'


def extract_article_urls(page_html, max_links=100):
    """Return the article URLs linked from one article-list page.

    Scans ``page_html`` for each ``<a title=`` anchor, then takes the text
    between the following ``href="`` and the end of the next ``.html``.

    Args:
        page_html: Source of a list page, as returned by ``read()``.
        max_links: Upper bound on links taken from a single page.

    Returns:
        A list of URL strings in document order (possibly empty).
    """
    urls = []
    title = page_html.find('<a title=')
    while title != -1 and len(urls) < max_links:
        href = page_html.find('href="', title)
        if href == -1:
            break
        end = page_html.find('.html', href)
        if end == -1:
            break
        # Skip the 6 characters of 'href="' and keep the 5 of '.html'.
        urls.append(page_html[href + 6:end + 5])
        title = page_html.find('<a title=', end)
    return urls


def article_file_name(article_url):
    """Return the local file name used to store ``article_url``.

    The name is the URL from ``blog_`` onwards.  When the URL has no
    ``blog_`` part, the last path segment is used instead, so that a
    failed ``find`` (-1) does not collapse the name to one character.
    """
    start = article_url.find('blog_')
    if start == -1:
        start = article_url.rfind('/') + 1
    return article_url[start:]


def fetch(address):
    """Download ``address`` and return the response body.

    Uses Python 2's ``urllib.urlopen``; the response is closed even when
    reading fails.
    """
    response = urllib.urlopen(address)
    try:
        return response.read()
    finally:
        response.close()


def main():
    """Collect the article URLs from list pages 1-7 and save every article."""
    # A growing list replaces the old fixed 350-slot buffer, which raised
    # IndexError as soon as the list pages yielded more than 350 links.
    url = []
    for page in range(1, 8):
        url.extend(extract_article_urls(fetch(LIST_PAGE_URL % page)))
    print('-----get url end ~~~~-----')
    print('length=  ' + str(len(url)))

    for address in url:
        print(address)
        content = fetch(address)
        # '\\/' is the same two-character separator the script always used.
        target = SAVE_DIR + '\\/' + article_file_name(address)
        # Binary mode: text mode on Windows rewrites '\n' as '\r\n' and
        # would alter the downloaded page.
        with open(target, 'wb') as fobj:
            fobj.write(content)
        # Pause between downloads to go easy on the server.
        time.sleep(1)


if __name__ == '__main__':
    main()
# Han Han's blog articles --- crawler 3
# (Web-page residue: "latest recommended article published 2022-09-22 12:30:08")