# -*- coding: utf-8 -*-
"""
@author: vincent
#/usr/bin/python
#http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html
"""
<a title="" target="_blank" href="http://blog.sina.com.cn/s/blog_4701280b0102eo83.html">《论电影的七个元素》——关于我对电…</a>
<a title="" target="_blank" href="http://blog.sina.com.cn/s/blog_4701280b0102eb8d.html">碎片</a></span>
"""
import os
import urllib.request

# Directory where fetched pages are saved (original hard-coded Windows path).
SAVE_DIR = r'D:\sublime codes\python\web_crawler'


def extract_url(anchor):
    """Return the href URL embedded in *anchor* (an ``<a ...>`` tag string).

    Slices from just after ``href="`` up to and including ``.html``.
    Returns '' when either marker is missing instead of producing a
    nonsense slice from find()'s -1 sentinel.
    """
    href = anchor.find('href="')
    html = anchor.find('.html', href)
    if href == -1 or html == -1:
        return ''
    return anchor[href + 6:html + 5]


def filename_from_url(url):
    """Return the ``blog_....html`` tail of *url*, used as the local file name."""
    start = url.find('blog_')
    return url[start:] if start != -1 else url


if __name__ == '__main__':
    mystr = '<a title="" target="_blank" href="http://blog.sina.com.cn/s/blog_4701280b0102eo83.html">《论电影的七个元素》——关于我对电…</a>'
    url = extract_url(mystr)
    filename = filename_from_url(url)
    print(url)
    print(filename)
    content = urllib.request.urlopen(url).read()
    # Binary mode: write the fetched bytes exactly as received; the original
    # text-mode 'w' handle could corrupt the payload and was never closed on error.
    with open(os.path.join(SAVE_DIR, filename), 'wb') as fobj:
        fobj.write(content)
# --------------------- Step 2 (进一步 "going further"): crawl one listing page ---------------------
# -*- coding: utf-8 -*-
"""Crawl the first Sina blog article-list page and save every linked article."""
import os
import time
import urllib.request

LIST_URL = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html'
SAVE_DIR = r'D:\sublime codes\python\web_crawler'


def find_article_urls(page, limit=100):
    """Return up to *limit* article URLs scraped from *page* (listing HTML).

    Walks successive ``<a title=`` anchors and slices each href value out of
    the text, from just after ``href="`` through the ``.html`` suffix.
    Stops early when any marker is no longer found.
    """
    urls = []
    pos = 0
    for _ in range(limit):
        title = page.find('<a title=', pos)
        if title == -1:
            break
        href = page.find('href="', title)
        html = page.find('.html', href)
        if href == -1 or html == -1:
            break
        urls.append(page[href + 6:html + 5])
        pos = html  # resume the search after this match
    return urls


if __name__ == '__main__':
    con = urllib.request.urlopen(LIST_URL).read().decode('utf-8', 'replace')
    urls = find_article_urls(con)
    print("url find over~~~~")
    # BUG FIX: the original iterated range(0, 50) regardless of how many URLs
    # were actually found, raising IndexError when fewer than 50 matched.
    for tmp in urls:
        print(tmp)
        content = urllib.request.urlopen(tmp).read()
        f_name = tmp[tmp.find('blog_'):]
        # Binary mode preserves the raw bytes; 'with' guarantees the close.
        with open(os.path.join(SAVE_DIR, f_name), 'wb') as fobj:
            fobj.write(content)
        time.sleep(3)  # be polite to the server between requests
# --------------------- Step 3 (最终的结果 "final result"): crawl every listing page ---------------------
# -*- coding: utf-8 -*-
"""Final version: crawl all 7 Sina blog article-list pages and save every article."""
import os
import time
import urllib.request

LIST_URL_FMT = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_{page}.html'
SAVE_DIR = r'D:\sublime codes\python\web_crawler'


def find_article_urls(page, limit=100):
    """Return up to *limit* article URLs scraped from *page* (listing HTML).

    Walks successive ``<a title=`` anchors and slices each href value out of
    the text, from just after ``href="`` through the ``.html`` suffix.
    Stops early when any marker is no longer found.
    """
    urls = []
    pos = 0
    for _ in range(limit):
        title = page.find('<a title=', pos)
        if title == -1:
            break
        href = page.find('href="', title)
        html = page.find('.html', href)
        if href == -1 or html == -1:
            break
        urls.append(page[href + 6:html + 5])
        pos = html  # resume the search after this match
    return urls


if __name__ == '__main__':
    all_urls = []  # grows as needed; the original pre-allocated 350 slots
    for page_no in range(1, 8):
        con = urllib.request.urlopen(
            LIST_URL_FMT.format(page=page_no)).read().decode('utf-8', 'replace')
        all_urls.extend(find_article_urls(con))
    print('-----get url end ~~~~-----')
    print('length= ', len(all_urls))
    for tmp in all_urls:
        print(tmp)
        content = urllib.request.urlopen(tmp).read()
        f_name = tmp[tmp.find('blog_'):]
        # Binary mode preserves the raw bytes; 'with' guarantees the close.
        with open(os.path.join(SAVE_DIR, f_name), 'wb') as fobj:
            fobj.write(content)
        time.sleep(1)  # be polite to the server between requests