I wrote a script that automatically downloads a given user's blog posts.
This script downloads CSDN blogs.
The same approach works for blogs on most other sites, such as Sina.
Sometimes a page request is refused; just rerun the script.
A program like this is written after analyzing the target site, which in my case is CSDN.
Page encoding issues are involved, and they can occasionally terminate the run.
I have already downloaded my own blog with it.
It only downloads the raw pages.
With some HTML parsing you could extract clean articles, which would be more practical.
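Two of the caveats above, refused requests and encoding errors, can also be softened in code instead of by rerunning. The sketch below is one way to do it; the helper name fetch_lines and the retry parameters are my own inventions, not part of the script that follows.

import time
import urllib.error
import urllib.request

def fetch_lines(url, retries=3, delay=2.0):
    #fetch a url, retrying a few times if the server refuses,
    #and decode leniently so one bad byte cannot abort the run
    #(the retries/delay values are arbitrary choices of mine)
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(url) as f:
                return [line.decode('utf-8', errors='replace')
                        for line in f.readlines()]
        except urllib.error.URLError:
            if attempt == retries - 1:
                raise
            time.sleep(delay)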
#
#blogdownloader_csdn.py
#@Author:onezeros@yahoo.cn ||Zhijie Lee
#I did not implement an image_download function,
#but it is not very difficult, so you can do it yourself
#cmd usage: blogdownloader_csdn.py blogname "full directory path"
#further extension: classify the articles into
#"原创" (original), "转载" (reposted), etc. according to CSDN
import os
import sys
import urllib.request

#global variables
username = sys.argv[1]
#username = 'onezeros'
#total number of archive pages
total_num = 0
dst_urls = []
#use the date as the file name
dst_title = []
###########################################
#function to find the urls of the articles
#it is necessary to verify whether each url actually exists
def url_finder(url_directory, firstpage=False):
    global username, total_num, dst_urls, dst_title
    url_f = urllib.request.urlopen(url_directory)
    print("open url " + url_directory + " successfully\n")
    url_front = '/' + username + '/archive/'
    for line in url_f.readlines():
        #errors='replace' keeps a single bad byte from aborting the run
        lin = line.decode('utf-8', errors='replace')
        pos_front = lin.find(url_front)
        if pos_front != -1:
            #article urls end with a fixed-length tail like '2009/12/13/4998191.aspx'
            pos_post = len('2009/12/13/4998191.aspx') + len(url_front) + pos_front
            if lin[pos_post] == '#':
                dst_urls.append('http://blog.csdn.net' + lin[pos_front:pos_post])
                s = lin[pos_front + len(url_front):pos_post - len('.aspx')]
                s = s.replace('/', '-')
                print(s)
                dst_title.append(s)
        if firstpage:
            #the first archive page states the total page count, e.g. '第1页 共N页'
            pos = lin.find('第1页')
            if pos != -1:
                pt = lin.find('页', pos + 5)
                total_num = int(lin[pos + 5:pt])
    if firstpage and total_num == 0:
        print("something went wrong when retrieving information from the first page")
        sys.exit(1)
    url_f.close()
    return
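#---------------------------------------------
#optional helper (my addition, not in the original script):
#the comment above url_finder says the urls ought to be verified;
#a minimal check using only the standard library could look like this
import urllib.error

def url_exists(url):
    try:
        urllib.request.urlopen(url).close()
        return True
    except (urllib.error.URLError, ValueError):
        return False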
################################################
def main():
    global username, total_num, dst_urls, dst_title
    #create the destination directory if it does not exist yet
    if not os.path.isdir(sys.argv[2]):
        print("the directory does not exist, do you want to create it?")
        while True:
            a = input('type y for yes or n for no\n')
            if a == 'y' or a == 'Y':
                os.makedirs(sys.argv[2])
                print("succeeded in making dir " + sys.argv[2])
                break
            elif a == 'n' or a == 'N':
                sys.exit(0)
    os.chdir(sys.argv[2])
    #os.chdir('d:/test')
    #deal with the first archive page
    url_finder('http://blog.csdn.net/' + username, True)
    #generate the urls of the remaining archive pages
    print(total_num)
    for pagenum in range(2, total_num + 1):  #not executed when there is only one page
        url = 'http://blog.csdn.net/' + username + '/default.aspx?PageNumber=' + str(pagenum)
        print("opening " + url)
        url_finder(url, False)
    #NOTE: in CSDN's source HTML, only the user's own articles carry the label this script looks for
    #download the pages only, without extracting the article content
    if len(dst_urls) != len(dst_title):
        print('error: url and title lists differ in length')
        sys.exit(0)
    #create and store the files
    for i in range(len(dst_urls)):
        url_f = urllib.request.urlopen(dst_urls[i])
        f = open(dst_title[i] + '.aspx', 'w', encoding='utf-8')
        for lin in url_f.readlines():
            lin = lin.decode('utf-8', errors='replace')
            f.write(lin)
        f.close()
        url_f.close()
        print("saved " + dst_title[i] + '.aspx successfully')
    print("\nDone\n")
if __name__ == '__main__':
    main()
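As mentioned at the top, parsing the saved pages to pull out the article text would be more practical than keeping raw HTML. As a starting point, here is a minimal sketch using the standard-library html.parser; it only collects page titles, and the class name TitleExtractor is my own. A real extractor would target the article container, whose tag and class names depend on CSDN's templates at the time, so check the saved source first.

from html.parser import HTMLParser

class TitleExtractor(HTMLParser):
    #collects the text inside <title> tags of a saved page
    def __init__(self):
        super().__init__()
        self.in_title = False
        self.titles = []

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self.in_title = True

    def handle_endtag(self, tag):
        if tag == 'title':
            self.in_title = False

    def handle_data(self, data):
        if self.in_title:
            self.titles.append(data.strip())

#usage: feed it the text of a saved page
#parser = TitleExtractor()
#parser.feed(open('2009-12-13-4998191.aspx', encoding='utf-8').read())
#print(parser.titles)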