# -*- coding: utf-8 -*-
"""
Created on Sun Jul 10 16:16:43 2016
@author: vincent
"""
#!/usr/bin/python
#http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html
import os
import time
import urllib
import urllib.request
def _extract_urls(page, limit=100):
    """Return up to *limit* article URLs parsed from the listing-page HTML.

    Each article appears as ``<a title=... href="....html">``.  The page is
    walked with ``str.find`` (the original cursor-style parse): locate the
    anchor, then slice out the href value up to and including the ``.html``
    suffix.

    :param page:  decoded HTML of the article-listing page
    :param limit: maximum number of URLs to collect (original cap: 100)
    :return:      list of article URL strings, possibly empty
    """
    urls = []
    pos = 0
    while len(urls) < limit:
        anchor = page.find('<a title=', pos)
        if anchor == -1:
            break
        start = page.find('href="', anchor)
        end = page.find('.html', start)
        if start == -1 or end == -1:
            break
        # +6 skips 'href="'; +5 keeps the '.html' suffix.
        urls.append(page[start + 6:end + 5])
        pos = end  # resume scanning after this link
    return urls


# Fetch the listing page.  The URLs we extract are plain ASCII, so decode
# defensively and ignore any characters the page encodes differently.
con = urllib.request.urlopen(
    'http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html'
).read().decode('utf-8', errors='ignore')

url = _extract_urls(con)
length = len(url)  # kept for parity with the original script's bookkeeping
if length < 100:
    # Original message, printed when the scan stopped before the cap.
    print("url find over~~~~")
# Download up to the first 50 articles and save each page verbatim.
# NOTE: the original indexed url[0]..url[49] unconditionally, which raised
# IndexError whenever fewer than 50 links had been found; slicing is safe.
for tmp in url[:50]:
    print(tmp)
    content = urllib.request.urlopen(tmp).read()
    # Name the local file after the 'blog_xxx.html' tail of the article URL.
    f_name = tmp[tmp.find(r'blog_'):]
    # Join the path properly instead of the original "...'+'\/'+..." hack.
    out_path = os.path.join(r'D:\sublime codes\python\web_crawler', f_name)
    # 'wb': write the fetched bytes untouched (text mode would apply
    # newline translation and corrupt the saved page on Windows).
    with open(out_path, 'wb') as fobj:
        fobj.write(content)
    time.sleep(3)  # throttle requests — be polite to the server
# 韩寒的博客文章---爬虫2  (Han Han's blog articles --- crawler 2)
# 最新推荐文章于 2018-01-18 09:43:54 发布
# (latest recommended article published 2018-01-18 09:43:54 — scraped page footer)