#!/usr/bin/python
# -*- coding: utf-8 -*-
import re,HTMLParser
import urllib2,os
addr_file='blog_addr.dat'  # saved blog URL list, one URL per line ('#' lines are comments)
html_file="out.html"  # temporary file holding the downloaded article-list page
diff_flag=0  # set to 1 by __main__ when a previous article list already exists locally
def maenu(file_name):
count=1
addr = { "":""}
try:
fp=open(file_name, "r")
except IOError:
os.mknod(file_name)
print "选择需要下载的链接:"
for eachline in fp:
if eachline[0] == '#':
continue
print "%d:%s" %(count,eachline)
addr[count]=eachline
count=count+1
print "%d:新增一条blog地址" % (count)
choice=raw_input('>>')
fp.close()
if re.findall(choice,'/bq|/bQ|/bquit|/bQUIT'):
quit()
try:
if int(choice) == count :
add_blog(file_name)
else:
return addr[int(choice)]
except ValueError:
print "输入有误,请重新选择!"
def add_blog(file_name):
fp=open(file_name,"a")
addr=raw_input('输入blog地址:')
fp.write(addr)
fp.write('\n')
print "%s 已经增加到blog地址列表中!" % addr
fp.close()
def is_set(var_name):
    """Return 1 if a variable named var_name is resolvable, else 0.

    Used by __main__ to probe whether chaper_url exists before the first
    menu pass. NOTE(review): this eval()s the name — callers only pass
    literal identifiers here; never feed it untrusted input.
    """
    try:
        eval(var_name)
    except NameError:
        # Narrowed from a bare except: only "name not defined" means unset.
        return 0
    return 1
class parseLinks(HTMLParser.HTMLParser):
    """HTML parser that records every article link found in <a> tags.

    Relies on two module-level globals set up by __main__ before feed():
    write_file (an open output file) and blog_address (the URL prefix).
    """

    def handle_starttag(self, tag, attrs):
        """Write 'title / full URL' for each article anchor.

        Anchors whose tag text mentions view counts / follows / private
        messages are page chrome, not articles, and are skipped.
        """
        if tag != 'a':
            return
        attr = dict(attrs)
        article = attr.get('title')
        if article is None:
            # Only anchors carrying a title attribute are article links.
            return
        if re.search('次数|关注|私信', self.get_starttag_text()):
            return
        # Read href directly from the parsed attributes instead of scanning
        # the raw tag text for quote characters; the original scan assumed
        # href was the first quoted attribute and title the second, and
        # raised ValueError on tags with fewer than two quoted attributes.
        address = attr.get('href', '')
        write_file.write("文章标题:%s\n文章地址:%s%s\n" % (article, blog_address, address))
def unload(common_url, file_name):
    """Download common_url into file_name, using a browser User-Agent.

    The UA header is needed because the site rejects the default
    urllib2 agent. The output file is written only after the download
    succeeds, so a failed request no longer truncates an existing file.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    # .strip() guards against a trailing newline carried over from the
    # address file.
    req = urllib2.Request(url=common_url.strip(), headers=headers)
    resp = urllib2.urlopen(req)
    try:
        data = resp.read()
    finally:
        resp.close()
    with open(file_name, 'w') as outfile:
        outfile.write(data)
def check_file(file_name):
    """Return 1 when file_name is an existing regular file, else 0."""
    return int(os.path.isfile(file_name))
def prt_content(diff_flag,author):
file1='%s.article' % author
if diff_flag == 1:
file2='%s.article.new' % author
print "以下是更新内容:"
os.system("diff %s %s" % (file1,file2))
os.system("mv %s %s" % (file2,file1))
else:
print open(file1,'r').read()
def get_blog_info(html_file):
    """Extract (page_text, username, blog_address) from the saved page.

    Pulls the inline javascript assignments 'var username = "...";' and
    'var blog_address = "...net' out of the raw HTML, then strips the
    quoting/assignment noise so only the bare values remain.

    Returns a 3-tuple: the full page text, the username string, and the
    blog address string. Raises IndexError if either pattern is absent.
    """
    with open(html_file, 'r') as fp:
        line = fp.read()
    username = re.findall(r"(var username.*?\;)", line, re.S)[0]
    blog_address = re.findall(r"(var blog_address.*?net)", line, re.S)[0]
    # Remove quotes, semicolons, keywords and spaces (at most 10
    # substitutions — enough for one assignment statement).
    username = re.sub("\"|;|var|=| |username", '', username, 10)
    blog_address = re.sub("\"|;|var|=| |blog_address", '', blog_address, 10)
    return (line, username, blog_address)
if __name__ == '__main__':
    # Loop until the menu yields a URL: maenu() returns None when the user
    # adds a new entry or makes an invalid choice. is_set() guards the very
    # first pass, when chaper_url is not defined yet.
    while (is_set('chaper_url') == 0) or (chaper_url is None):
        chaper_url=maenu(addr_file)
    print "正在下载文章列表...."
    unload(chaper_url,html_file)
    (line,username,blog_address)=get_blog_info(html_file)
    article_file="%s.article" % username
    if check_file(article_file)==0:
        # First download for this author: write the list file directly.
        pass
    else:
        # A stored list exists: write to a '.new' file so prt_content can
        # diff it against the old one and show only fresh articles.
        article_file="%s.article.new" % username
        diff_flag=1
    # write_file and blog_address are read as globals by parseLinks.
    write_file=open(article_file,'w')
    write_file.write("作者:%s\n" % username)
    lParser = parseLinks()
    lParser.feed(line)
    write_file.close()
    lParser.close()
    prt_content(diff_flag,username)
    # Drop the temporary page; the article list persists in article_file.
    os.remove(html_file)
发出来供大家批评指点!
说明:当要下载的博客在本地已有记录时,程序会检查是否有新文章发表;如有,则展示新文章的标题和链接。某作者的所有文章列表都存储在"作者名称.article"文件中!