#!/usr/bin/python
# -*- coding: utf-8 -*-
import re,HTMLParser
import urllib2,os
addr_file='blog_addr.dat'  # saved blog URL list, one URL per line ('#' lines are comments)
html_file="out.html"  # temporary file holding the downloaded article-list page
diff_flag=0  # set to 1 by __main__ when a previous article list already exists locally
def maenu(file_name):
count=1
addr = { "":""}
try:
fp=open(file_name, "r")
except IOError:
os.mknod(file_name)
print "选择需要下载的链接:"
for eachline in fp:
if eachline[0] == '#':
continue
print "%d:%s" %(count,eachline)
addr[count]=eachline
count=count+1
print "%d:新增一条blog地址" % (count)
choice=raw_input('>>')
fp.close()
if re.findall(choice,'/bq|/bQ|/bquit|/bQUIT'):
quit()
try:
if int(choice) == count :
add_blog(file_name)
else:
return addr[int(choice)]
except ValueError:
print "输入有误,请重新选择!"
def add_blog(file_name):
fp=open(file_name,"a")
addr=raw_input('输入blog地址:')
fp.write(addr)
fp.write('\n')
print "%s 已经增加到blog地址列表中!" % addr
fp.close()
def is_set(var_name):
    """Return 1 if a variable named var_name is resolvable, else 0.

    Used by __main__ to probe whether chaper_url exists before the first
    menu pass. NOTE(review): this eval()s the name — callers only pass
    literal identifiers here; never feed it untrusted input.
    """
    try:
        eval(var_name)
    except NameError:
        # Narrowed from a bare except: only "name not defined" means unset.
        return 0
    return 1
class parseLinks(HTMLParser.HTMLParser):
    """HTML parser that records every article link found in <a> tags.

    Relies on two module-level globals set up by __main__ before feed():
    write_file (an open output file) and blog_address (the URL prefix).
    """

    def handle_starttag(self, tag, attrs):
        """Write 'title / full URL' for each article anchor.

        Anchors whose tag text mentions view counts / follows / private
        messages are page chrome, not articles, and are skipped.
        """
        if tag != 'a':
            return
        attr = dict(attrs)
        article = attr.get('title')
        if article is None:
            # Only anchors carrying a title attribute are article links.
            return
        if re.search('次数|关注|私信', self.get_starttag_text()):
            return
        # Read href directly from the parsed attributes instead of scanning
        # the raw tag text for quote characters; the original scan assumed
        # href was the first quoted attribute and title the second, and
        # raised ValueError on tags with fewer than two quoted attributes.
        address = attr.get('href', '')
        write_file.write("文章标题:%s\n文章地址:%s%s\n" % (article, blog_address, address))
def unload(common_url, file_name):
    """Download common_url into file_name, using a browser User-Agent.

    The UA header is needed because the site rejects the default
    urllib2 agent. The output file is written only after the download
    succeeds, so a failed request no longer truncates an existing file.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    # .strip() guards against a trailing newline carried over from the
    # address file.
    req = urllib2.Request(url=common_url.strip(), headers=headers)
    resp = urllib2.urlopen(req)
    try:
        data = resp.read()
    finally:
        resp.close()
    with open(file_name, 'w') as outfile:
        outfile.write(data)
def check_file(file_name):
    """Return 1 when file_name is an existing regular file, else 0."""
    return int(os.path.isfile(file_name))
def prt_content(diff_flag,author):
file1='%s.article' % author
if diff_flag == 1:
file2='%s.article.new' % author
print "以下是更新内容:"
os.system("diff %s %s" % (file1,file2))
os.system("mv %s %s" % (file2,file1))
else:
print open(file1,'r').read()
def get_blog_info(html_file):
    """Extract (page_text, username, blog_address) from the saved page.

    Pulls the inline javascript assignments 'var username = "...";' and
    'var blog_address = "...net' out of the raw HTML, then strips the
    quoting/assignment noise so only the bare values remain.

    Returns a 3-tuple: the full page text, the username string, and the
    blog address string. Raises IndexError if either pattern is absent.
    """
    with open(html_file, 'r') as fp:
        line = fp.read()
    username = re.findall(r"(var username.*?\;)", line, re.S)[0]
    blog_address = re.findall(r"(var blog_address.*?net)", line, re.S)[0]
    # Remove quotes, semicolons, keywords and spaces (at most 10
    # substitutions — enough for one assignment statement).
    username = re.sub("\"|;|var|=| |username", '', username, 10)
    blog_address = re.sub("\"|;|var|=| |blog_address", '', blog_address, 10)
    return (line, username, blog_address)
if __name__ == '__main__':
    # Loop until the menu yields a URL: maenu() returns None when the user
    # adds a new entry or makes an invalid choice. is_set() guards the very
    # first pass, when chaper_url is not defined yet.
    while (is_set('chaper_url') == 0) or (chaper_url is None):
        chaper_url=maenu(addr_file)
    print "正在下载文章列表...."
    unload(chaper_url,html_file)
    (line,username,blog_address)=get_blog_info(html_file)
    article_file="%s.article" % username
    if check_file(article_file)==0:
        # First download for this author: write the list file directly.
        pass
    else:
        # A stored list exists: write to a '.new' file so prt_content can
        # diff it against the old one and show only fresh articles.
        article_file="%s.article.new" % username
        diff_flag=1
    # write_file and blog_address are read as globals by parseLinks.
    write_file=open(article_file,'w')
    write_file.write("作者:%s\n" % username)
    lParser = parseLinks()
    lParser.feed(line)
    write_file.close()
    lParser.close()
    prt_content(diff_flag,username)
    # Drop the temporary page; the article list persists in article_file.
    os.remove(html_file)
发出来供大家批评指点!
说明:当要下载的博客在本地已有记录时,程序会检查是否有新文章发表;如有,则展示新文章的标题和链接。某作者的所有文章列表都存储在"作者名称.article"文件中!