学习Python,自己写的一个离线转csdn博客的工具[未完善]

原理比较简单:先利用python发包自动登录CSDN,获得CSDN的登录cookie;在此基础上,向writeblog.csdn.net服务器发送博客正文。博客正文是通过分析别的网页得出的(请勿用于非法用途)。我主要是为了方便将我以前收藏的文章转出来,方便我和大家学习,所以本工具的实例是将CSDN博客内收藏的文章转为发表的转载文章,标题以[z]结尾。算了,还是贴代码吧。实现中有几点需要说明:

1)自动登录csdn帐号时需要输入验证码。我没有做验证码自动识别,只是将验证码图片下载下来存为'a.jpeg',因此开始登录时还需要人工查看图片并输入验证码

2)代码中用到“漂亮的汤”BeautifulSoup来解析web网页元素,挺不错的,所以需要自备BeautifulSoup包文件,另外还有一个十进制转其他进制的函数,是从某国外论坛上贴过来的,非常不错,我也贴出来,免得大家找麻烦

3)最重要的,很多异常处理没有做,时间比较紧,很多注释输出也没有关闭,大家想交流的,再email我吧,[thomasliu83 AT gmail.com]

下面先贴进制之间相互转换的代码:

# -*- coding: utf8 -*-
# file: baseconvert.py

# Digit alphabets, ordered from the smallest digit to the largest.
# The length of each string is the base it represents, so the strings must
# not contain any padding characters.
BASE2 = "01"
BASE10 = "0123456789"
BASE16 = "0123456789ABCDEF"
BASE62 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz"
BASE36 = "0123456789abcdefghijklmnopqrstuvwxyz"

def baseconvert(number, fromdigits, todigits):
    """Convert ``number`` from one positional digit alphabet to another.

    ``number`` is converted with ``str()`` and each of its characters is
    looked up in ``fromdigits`` (ordered from the smallest digit to the
    largest).  The result is built from the elements of ``todigits``
    (ordered the same way).  The input and output bases are the lengths of
    the two digit sequences.  A leading minus sign is passed through.

    Args:
        number: the value to convert; anything whose ``str()`` is a run of
            digits from ``fromdigits``, optionally preceded by ``-``.
        fromdigits: sequence of source digits, smallest first.
        todigits: sequence of target digits, smallest first.  Its elements
            must be strings; they may be longer than one character.

    Returns:
        The converted number as a string.  Zero is rendered as
        ``todigits[0]``.

    Raises:
        ValueError: if a character of ``number`` is not in ``fromdigits``.

    decimal to binary
    >>> baseconvert(555,BASE10,BASE2)
    '1000101011'

    binary to decimal
    >>> baseconvert('1000101011',BASE2,BASE10)
    '555'

    integer interpreted as binary and converted to decimal (!)
    >>> baseconvert(1000101011,BASE2,BASE10)
    '555'

    base10 to base4
    >>> baseconvert(99,BASE10,"0123")
    '1203'

    base4 to base5 (with alphabetic digits)
    >>> baseconvert(1203,"0123","abcde")
    'dee'

    base5, alpha digits back to base 10
    >>> baseconvert('dee',"abcde",BASE10)
    '99'

    decimal to a base that uses A-Z0-9a-z for its digits
    >>> baseconvert(257938572394,BASE10,BASE62)
    'E78Lxik'

    ..convert back
    >>> baseconvert('E78Lxik',BASE62,BASE10)
    '257938572394'

    binary to a base with words for digits (the function cannot convert this back)
    >>> baseconvert('1101',BASE2,('Zero','One'))
    'OneOneZeroOne'

    zero and negative numbers
    >>> baseconvert(0,BASE10,BASE2)
    '0'
    >>> baseconvert(-255,BASE10,BASE16)
    '-FF'
    """
    text = str(number)
    negative = text.startswith('-')
    if negative:
        text = text[1:]

    # Make an integer out of the number, most significant digit first.
    source_base = len(fromdigits)
    value = 0
    for digit in text:
        value = value * source_base + fromdigits.index(digit)

    # Create the result in base ``len(todigits)``.  Zero needs its own
    # branch because the division loop below would emit no digits for it.
    target_base = len(todigits)
    if value == 0:
        result = todigits[0]
    else:
        result = ""
        while value > 0:
            # divmod keeps the arithmetic integral on Python 2 and 3 alike.
            value, remainder = divmod(value, target_base)
            result = todigits[remainder] + result

    if negative:
        result = "-" + result
    return result

下面是主文件,mycookie.py[名字取的不好]

 

# -*- coding: utf8 -*-
# file: mycookie.py

import  httplib
import  urllib2,cookielib
import  urllib,types
import  BeautifulSoup

def GetContent(source, id, name):
    """Return the ``value`` attribute of an ``<input>`` element in ``source``.

    Args:
        source: HTML text to search.
        id: name of the attribute to match on (callers in this file pass
            ``'name'``).  The parameter shadows the ``id`` builtin; the name
            is kept so existing callers continue to work.
        name: value the ``id`` attribute must have, e.g. ``'__VIEWSTATE'``.

    Returns:
        The element's ``value`` attribute, or ``''`` when no ``<input>``
        matches or the matching element has no ``value`` attribute.
    """
    # BeautifulSoup does the HTML parsing and the element lookup.
    soup = BeautifulSoup.BeautifulSoup(source)
    element = soup.find('input', attrs={id: name})
    if not element:
        return ''
    # .get() avoids a KeyError for an <input> without a value attribute.
    return element.get('value', '')

def Process(opener, strlink, cookiejar, data=None, storefile=None):
    """Request ``strlink`` through ``opener`` and return the response.

    The request carries the cookies currently held in ``cookiejar`` and a
    fixed ``User-Agent`` header.

    Args:
        opener: object whose ``open(request)`` performs the request.
        strlink: URL to request.
        cookiejar: cookie jar whose ``add_cookie_header`` fills in the
            request's cookie header.
        data: optional dict of form fields; when it is a non-empty dict it
            is url-encoded and attached to the request as its body.
        storefile: optional path; when given, the response body is written
            to that file in binary mode.

    Returns:
        The response object returned by ``opener.open``, or ``None`` when a
        required argument is missing or the body could not be stored.
        When ``storefile`` is used the body has already been read from the
        response, so callers should only test the result for truthiness.
    """
    # Any missing argument makes the request impossible, hence ``or``.
    if opener is None or strlink is None or cookiejar is None:
        print('opener,strlink and cookiejar must be not null')
        return None

    # Build the request.
    req = urllib2.Request(strlink)
    cookiejar.add_cookie_header(req)  # copy the current cookies into the headers
    req.add_header('User-Agent', 'Not IE 1.0 + Say Hello')

    # Attach the form data, if any.
    if data and isinstance(data, dict):
        req.add_data(urllib.urlencode(data))
    else:
        print('Notice: No data additional')

    # Perform the request.
    link = opener.open(req)

    if not storefile:
        print('Notice: No file stored')
        return link

    # Store the page; ``with`` closes the file even when the write fails.
    try:
        with open(storefile, 'wb') as out:
            out.write(link.read())
    except Exception as x:
        # Best effort: report the problem and signal failure to the caller.
        print('error: store file error, %s' % x)
        return None
    return link


def GetPass():
    """Log in to the CSDN passport site interactively.

    Steps performed, all through the module-level ``opener`` and ``cook``:

    1. fetch the login page and read its ``__VIEWSTATE`` field;
    2. download the captcha image to ``a.jpeg`` (the user has to look at
       the picture and type the code);
    3. prompt for user name, password and captcha code;
    4. post the login form.

    On success the session cookies end up in ``cook``.  The process exits
    with status 1 when the login page or the captcha cannot be fetched.
    """
    import time
    import baseconvert

    login_page = 'http://passport.csdn.net/UserLogin.aspx?from=Passport.aspx'
    login_post = 'http://passport.csdn.net/UserLogin.aspx?from=/Passport.aspx'

    link = Process(opener, login_page, cook)
    if not link:
        print('error: link is None')
        raise SystemExit(1)
    content = link.read()

    # Extract the __VIEWSTATE field that has to be posted back.
    strvalue = GetContent(content, 'name', '__VIEWSTATE')
    print(strvalue)

    # Build the captcha request token: current time in milliseconds,
    # written in base 36.
    astr = str(int(time.time() * 1000))
    print(type(astr))

    BASE36 = "0123456789abcdefghijklmnopqrstuvwxyz"
    BASE10 = "0123456789"
    token = baseconvert.baseconvert(astr, BASE10, BASE36)
    print(token)

    # Download the captcha picture to a.jpeg.
    link = Process(opener,
                   'http://passport.csdn.net/ShowExPwd.aspx?temp=' + token,
                   cook, None, 'a.jpeg')
    if not link:
        print('error: link is None')
        raise SystemExit(1)

    # Ask for the account and the captcha code.
    user = raw_input('username:')
    passwd = raw_input('password:')
    code = raw_input('verifycode:')

    # The pydev console appends '\r' to the typed text; drop it.
    # endswith() is also safe for empty input.
    private = []
    for answer in (user, passwd, code):
        if answer.endswith('\r'):
            answer = answer[:-1]
        private.append(answer)

    # Form fields of the login page.
    postdata = {
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': strvalue,
        'tb_LoginNameOrLoginEmail': private[0],
        'tb_Password': private[1],
        'ClientKey': '',
        'tb_ExPwd': private[2],
        'from': login_post,
        'MailParameters': 'from=' + login_post,
        'Image_Login.x': '0',
        'Image_Login.y': '0',
    }

    # Special case: the passport request is built by hand instead of going
    # through Process().
    req = urllib2.Request(login_post)
    cook.add_cookie_header(req)  # fill the cookie header
    # _cookies_for_request is a private CookieJar method; the original
    # author notes there is probably a better way to do this.
    cookies = cook._cookies_for_request(req)

    # Look up the ClientKey cookie.
    # NOTE(review): the value is read but never copied into postdata, so
    # 'ClientKey' is posted empty -- confirm whether that is intended.
    clientkey = ''
    for cookie in cookies:
        if cookie.name == 'ClientKey':
            clientkey = cookie.value
            break

    req.add_header('User-Agent', 'Not IE 1.0 + Say Hello')
    req.add_data(urllib.urlencode(postdata))

    # Open the link, i.e. POST the form.
    link = opener.open(req)

    # If the login went through, the account cookies are now in the jar.
    print(cook)
    
def PostBlog(title, content, tags):
    """Publish one article through the CSDN post editor page.

    The editor page is fetched first to obtain its ``__VIEWSTATE`` and
    ``__EVENTVALIDATION`` fields, then the filled-in form is posted back
    and the answer page is stored in ``last.html``.  Uses the module-level
    ``opener`` and ``cook``.

    Args:
        title: article title; ``'[z]'`` is appended to it.
        content: article body (HTML); its first 500 characters followed by
            ``'...'`` are sent as the excerpt.
        tags: tag string sent in the tags field.

    Returns:
        None.  The outcome is reported on stdout as ``send ok`` or
        ``send wrong``; nothing is printed here when the editor page
        cannot be fetched.
    """
    strlink = 'http://writeblog.csdn.net/PostEdit.aspx'
    link = Process(opener, strlink, cook, None, None)
    if not link:
        return

    html = link.read()
    view_state = GetContent(html, 'name', '__VIEWSTATE')
    event_validate = GetContent(html, 'name', '__EVENTVALIDATION')

    # Form fields that are about to be sent.
    prefix = 'ctl00$ContentPlaceHolder1$EntryEditor1$'
    data = {
        '__LASTFOCUS': '',
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': view_state,
        prefix + 'txbTitle': title + '[z]',
        prefix + 'richTextEditor$richTextEditor': content,
        prefix + 'txbTags': tags,
        prefix + 'txbExcerpt': content[:500] + '...',
        prefix + 'SaveButton': '发表文章',
        prefix + 'rblOri': 'copy',
        prefix + 'GlobalCategoryList': '',
        prefix + 'ckbPublished': 'on',
        prefix + 'chkDisplayHomePage': 'on',
        prefix + 'chkIsAggregated': 'on',
        prefix + 'chkCopytoClipboard': 'on',
        '__EVENTVALIDATION': event_validate,
    }

    link = Process(opener, strlink, cook, data, 'last.html')
    if link:
        print('send ok')
    else:
        print('send wrong')
        

def GetAndPostArticle(links, type=2):
    """Fetch each article in ``links`` and re-publish it with PostBlog.

    For every URL the page is downloaded, the title is taken from the
    ``postTitle`` div and the body from the ``postText`` div, and both are
    handed to ``PostBlog`` together with a fixed tag list.  A failure on
    one article is reported on stdout and does not stop the others.

    Args:
        links: iterable of article URLs; nothing is done when it is empty
            or None.
        type: unused; kept so existing callers continue to work (the name
            shadows the ``type`` builtin).

    Returns:
        None.
    """
    if not links:
        print('No url link in links parameter')
        return

    # The body div is serialised with its own wrapper tags, which are cut
    # off by length below.
    open_tag = '<div class="postText">'
    close_tag = '</div>'
    tags = 'Linux,技术,IT,反汇编,调试,内存泄露,Win32,项目'

    for link in links:
        try:
            htm = urllib.urlopen(link).read()
            soup = BeautifulSoup.BeautifulSoup(htm)

            # Title: second child of the <a> inside the postTitle div.
            title_div = soup.find('div', attrs={'class': 'postTitle'})
            title = str(title_div.a.contents[1])
            title = title.replace('&nbsp;', '')

            body_div = soup.find('div', attrs={'class': 'postText'})
            if body_div is None:
                # Without this check str(None) would be sliced into an
                # empty string and an empty article would be posted.
                raise ValueError('no postText div found')
            print('----')

            content = str(body_div)
            content = content[len(open_tag):len(content) - len(close_tag)]

            PostBlog(title, content, tags)
            # NOTE(review): PostBlog returns nothing, so this line only
            # means that no exception was raised.
            print('Article  %s Success! ^_^' % link)
        except Exception as x:
            # Best effort per article: report and carry on with the next.
            print('Article  %s Failed!!!!!! %s .....' % (link, x))


####### main entry ##########

import sys

# NOTE(review): reload() restores sys.setdefaultencoding, which the
# interpreter removes at start-up; the process-wide default encoding is
# then switched to gb2312.  This is a Python 2 only hack.
reload(sys)
sys.setdefaultencoding('gb2312')

# global data
# HTTP handler whose debug level can be raised to 1 while debugging.
h = urllib2.HTTPHandler(debuglevel=0)
# Cookie jar that keeps the session cookies.
cook = cookielib.CookieJar()
cookie = urllib2.HTTPCookieProcessor(cook)
# Opener used for every request made by this script.
opener = urllib2.build_opener(cookie, h)

GetPass()

# These links are only an example.
links = [
    'http://blog.csdn.net/thomasliu83/articles/760339.aspx',
    'http://blog.csdn.net/thomasliu83/articles/740153.aspx',
    'http://blog.csdn.net/thomasliu83/articles/222986.aspx',
    'http://blog.csdn.net/thomasliu83/articles/222975.aspx',
]

GetAndPostArticle(links, 1)

""" #for test
title='my tesdfgsdfgsfdgdsfgt 1'
content='content slajf;ldsakjflkasdjfl;asjflka;sdjfl1'
tags='tags 1'

PostBlog(title,content,tags)
PostBlog(title,content,tags)
PostBlog(title,content,tags)
"""
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 19
    评论
评论 19
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值