看视频学习后,练习下载博客文章。
#-*-coding:utf-8 -*-
import urllib
import time
# Category index page being scraped:
#   http://www.cppblog.com/woaidongmao/category/6007.html
# Each post link on that page looks like:
#   <a id="CategoryEntryList1_EntryStoryList_Entries_ctl00_TitleUrl"
#      href="http://www.cppblog.com/woaidongmao/archive/2012/01/12/164064.html">
h = "http://www.cppblog.com/woaidongmao/category/6007.html"
con = urllib.urlopen(h).read()

# Fixed-size result list; slots that are never filled keep the " " placeholder.
url = [" "] * 50
i = 0     # number of post links stored so far
html = 0  # search cursor: position of '.html' in the previously matched link
while i < 50:
    # Locate the next anchor id marker, then its href attribute, then the
    # end of the link target.
    title = con.find('TitleUrl"', html)
    href = con.find('href=', title)
    html = con.find('.html', href)
    if -1 in (title, href, html):
        break
    # Skip the 6 characters of 'href="' and keep everything through '.html'.
    url[i] = con[href + 6:html + 5]
    print('%d url: %s' % (i, url[i]))
    i += 1
# Download the first few posts found above and save each one as raw HTML.
# At most 4 posts are fetched, and never more than the number of links that
# were actually collected (`i`), so a " " placeholder slot of `url` is never
# handed to urlopen.
download_count = min(4, i)
for j in range(download_count):
    response = urllib.urlopen(url[j])
    try:
        c = response.read()
    finally:
        response.close()
    # The last 11 characters of the link are the post file name, e.g.
    # "164064.html". Binary mode keeps the downloaded bytes untouched
    # (text mode on Windows would rewrite the line endings).
    with open('D:/py_Script/0409/' + url[j][-11:], 'wb') as out_file:
        out_file.write(c)
    # Pause between requests to go easy on the server; there is nothing to
    # wait for after the final download.
    if j < download_count - 1:
        time.sleep(15)
上面的脚本下载后保存的是 .html 文件。
如何保存为 .txt 文件?下面是更新后的版本:
#-*-coding:utf-8 -*-
import urllib
import time
import re
# Category index page being scraped:
#   http://www.cppblog.com/woaidongmao/category/6007.html
# Each post link on that page looks like:
#   <a id="CategoryEntryList1_EntryStoryList_Entries_ctl00_TitleUrl"
#      href="http://www.cppblog.com/woaidongmao/archive/2012/01/12/164064.html">
h = "http://www.cppblog.com/woaidongmao/category/6007.html"
con = urllib.urlopen(h).read()

# Fixed-size result list; slots that are never filled keep the " " placeholder.
url = [" "] * 40
i = 0     # number of post links stored so far
html = 0  # search cursor: position of '.html' in the previously matched link
while i < 40:
    # Locate the next anchor id marker, then its href attribute, then the
    # end of the link target.
    title = con.find('TitleUrl"', html)
    href = con.find('href=', title)
    html = con.find('.html', href)
    if -1 in (title, href, html):
        break
    # Skip the 6 characters of 'href="' and keep everything through '.html'.
    url[i] = con[href + 6:html + 5]
    print('%s %s' % (i, url[i]))
    i += 1
print("find finish")
# Download the first few posts found above, cut out the post body, strip the
# HTML tags and save the remaining text as a .txt file.
#
# Sample of the markup the body is cut from:
#   <div class="postbody">
#   <p>Taking XML as an example, it has to be made into separate files:</p>
#   <p>1. Edited by hand; the program only reads it</p>
#   <p>2. The kind the program reads in and also writes out</p>
#   </div>
#
# NOTE: the body is taken up to the first '</div' after the marker, so a post
# whose body contains nested <div> elements is cut short at the inner one.

# Matches any single HTML tag; compiled once instead of on every iteration.
tag_pattern = re.compile('<[^>]+>')

# At most 6 posts are fetched, and never more than the number of links that
# were actually collected (`i`), so a " " placeholder slot of `url` is never
# handed to urlopen.
download_count = min(6, i)
for j in range(download_count):
    response = urllib.urlopen(url[j])
    try:
        c = response.read()
    finally:
        response.close()

    body = c.find(r'postbody">')
    if body == -1:
        # Without the marker there is no body to slice; writing a file here
        # would only store an arbitrary chunk of the page.
        print('no post body found: %s' % url[j])
    else:
        div = c.find(r'</div', body)
        if div == -1:
            # No closing tag: keep everything up to the end of the page
            # rather than letting -1 chop off the last character.
            div = len(c)
        post_html = c[body + 10:div]  # 10 == len('postbody">')
        post_text = tag_pattern.sub("", post_html)
        # url[j][-11:-5] is the post id, i.e. the file name without '.html'.
        with open('D:/Python_Data/download/' + url[j][-11:-5] + '.txt', 'w') as out_file:
            out_file.write(post_text)

    # Pause between requests to go easy on the server; there is nothing to
    # wait for after the final download.
    if j < download_count - 1:
        time.sleep(15)
print("download end")