学习Python，自己写的一个离线转csdn博客的工具[未完善]-CSDN博客

本文链接：https://blog.csdn.net/ThomasLiu83/article/details/2066664

原理比较简单：先利用 Python 发包自动登录 CSDN，获得 CSDN 的登录 cookie；在此基础上，向 writeblog.csdn.net 服务器发送博客正文。博客正文是通过解析别的网页得到的（请勿用于非法用途）。我主要是为了方便把以前收藏的文章转出来，便于我和大家学习，所以本工具的示例是将 CSDN 博客内收藏的文章转为已发表的转载文章，标题以 [z] 结尾……算了，还是贴代码吧。实现中有以下几点需要说明：

1）自动登录 CSDN 帐号时需要输入验证码。我没有做验证码的自动识别，只是将验证码图片下载下来存为 'a.jpeg'，所以开始登录时还需要人眼识别后手动输入。

2）代码中用到“漂亮的汤”BeautifulSoup 来解析网页元素，挺不错的，所以需要自备 BeautifulSoup 包文件。另外还有一个在任意进制之间相互转换的函数，是从某国外论坛上贴过来的，非常不错，我也一并贴出来，省得大家再去找。

3）最重要的一点：很多异常处理没有做，时间比较紧，很多调试输出也没有关闭。大家想交流的话，再 email 我吧：[thomasliu83 AT gmail.com]

下面先贴进制之间相互转换的代码：

# -*- coding: utf-8 -*-

# file:baseconvert.py

# Digit alphabets for baseconvert(), ordered from the smallest digit to the
# largest.  The length of each alphabet is the base it represents, so the
# strings must not contain padding or repeated characters.
BASE2 = "01"
BASE10 = "0123456789"
BASE16 = "0123456789ABCDEF"
BASE62 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz"
BASE36 = "0123456789abcdefghijklmnopqrstuvwxyz"

def baseconvert(number, fromdigits, todigits):
    """Convert ``number`` from one digit alphabet to another.

    ``number`` is turned into text with ``str()`` and read as a sequence of
    digits taken from ``fromdigits``.  Both alphabets are ordered from the
    smallest digit to the largest, and the base of each side is the length
    of its alphabet.  A leading ``-`` is passed through to the result.

    Args:
        number: The value to convert; anything whose ``str()`` is a string
            of characters from ``fromdigits``, optionally preceded by ``-``.
        fromdigits: Digit alphabet of the input (a string, or any sequence
            of single characters that supports ``index``).
        todigits: Digit alphabet of the output.  It may be a sequence of
            longer strings, in which case the result cannot be parsed back.

    Returns:
        The converted number as a string built from ``todigits``.  Zero is
        returned as the first element of ``todigits`` without a sign.

    Raises:
        ValueError: If ``number`` contains no digits, or contains a
            character that is not in ``fromdigits``.

    decimal to binary
    >>> baseconvert(555,BASE10,BASE2)
    '1000101011'

    binary to decimal
    >>> baseconvert('1000101011',BASE2,BASE10)
    '555'

    integer interpreted as binary and converted to decimal (!)
    >>> baseconvert(1000101011,BASE2,BASE10)
    '555'

    base10 to base4
    >>> baseconvert(99,BASE10,"0123")
    '1203'

    base4 to base5 (with alphabetic digits)
    >>> baseconvert(1203,"0123","abcde")
    'dee'

    base5, alpha digits back to base 10
    >>> baseconvert('dee',"abcde",BASE10)
    '99'

    decimal to a base that uses A-Z0-9a-z for its digits
    >>> baseconvert(257938572394,BASE10,BASE62)
    'E78Lxik'

    ..convert back
    >>> baseconvert('E78Lxik',BASE62,BASE10)
    '257938572394'

    binary to a base with words for digits (the function cannot convert this back)
    >>> baseconvert('1101',BASE2,('Zero','One'))
    'OneOneZeroOne'

    zero and negative numbers
    >>> baseconvert(0,BASE10,BASE2)
    '0'
    >>> baseconvert(-5,BASE10,BASE2)
    '-101'
    """
    text = str(number)
    negative = text.startswith('-')
    if negative:
        text = text[1:]
    if not text:
        raise ValueError('number contains no digits: %r' % (number,))

    # Make an integer out of the number.  index() raises ValueError for a
    # character that is not part of the source alphabet.
    from_base = len(fromdigits)
    value = 0
    for digit in text:
        value = value * from_base + fromdigits.index(digit)

    # The digit loop below emits nothing for zero, so handle it explicitly.
    if value == 0:
        return todigits[0]

    # Create the result in base len(todigits), least significant digit first.
    to_base = len(todigits)
    pieces = []
    while value > 0:
        # divmod keeps the quotient an integer on both Python 2 and 3.
        value, remainder = divmod(value, to_base)
        pieces.append(todigits[remainder])

    result = ''.join(reversed(pieces))
    if negative:
        result = '-' + result
    return result

下面是主文件，mycookie.py[名字取的不好]

# -*- coding: utf-8 -*-

# file: mycookie.py

import httplib

import urllib2,cookielib

import urllib,types

import BeautifulSoup

def GetContent(source, id, name):
    """Return the ``value`` of the first ``<input>`` in ``source`` matching an attribute.

    Args:
        source: HTML text to parse.
        id: Name of the attribute to match on (for example ``'name'``).
        name: Value that attribute must have (for example ``'__VIEWSTATE'``).

    Returns:
        The ``value`` attribute of the first ``<input>`` element whose
        attribute ``id`` equals ``name``, or ``''`` when no such element
        exists or it carries no ``value`` attribute.
    """
    # BeautifulSoup does the HTML parsing.
    soup = BeautifulSoup.BeautifulSoup(source)
    field = soup.find('input', attrs={id: name})
    if field is None:
        return ''
    # Tag.get avoids a KeyError for an <input> without a value attribute.
    return field.get('value', '')

def Process(opener, strlink, cookiejar, data=None, storefile=None):
    """Build an HTTP request for ``strlink`` and send it through ``opener``.

    The cookies held in ``cookiejar`` are added to the request header, a
    fixed User-Agent is set, and ``data`` (if given) is URL-encoded and
    attached as the request body.

    Args:
        opener: Object whose ``open(request)`` sends the request.
        strlink: URL to request.
        cookiejar: Cookie jar whose ``add_cookie_header`` fills in the
            request's cookies.
        data: Optional dict of form fields.  Anything that is empty or not
            a dict is ignored and a notice is printed.
        storefile: Optional path.  When given, the response body is read
            and written to this file in binary mode.

    Returns:
        The response object returned by ``opener.open``, or ``None`` when a
        required argument is missing or the body could not be stored.  When
        ``storefile`` is given the body has already been read from the
        returned response.
    """
    # Every one of the three is required; any single missing value would
    # fail further down, so reject it here.
    if opener is None or cookiejar is None or strlink is None:
        print('opener,strlink and cookiejar must be not null')
        return None

    # Build the request.
    req = urllib2.Request(strlink)
    cookiejar.add_cookie_header(req)  # put the current cookies into the header
    req.add_header('User-Agent', 'Not IE 1.0 + Say Hello')

    # Attach the form data.
    if data and isinstance(data, dict):
        req.add_data(urllib.urlencode(data))
    else:
        print('Notice: No data additional')

    # Send it.
    link = opener.open(req)

    if not storefile:
        print('Notice: No file stored')
        return link

    # Best effort: a failure to store the page is reported, not raised.
    try:
        with open(storefile, 'wb') as out:
            out.write(link.read())
    except Exception as x:
        print('error: store file error, %s' % x)
        return None
    return link

def GetPass():
    """Log in to the CSDN passport site interactively.

    Uses the module-level ``opener`` and ``cook``.  The steps are:

    1. Fetch the login page and read its ``__VIEWSTATE`` field.
    2. Download the verification-code image to ``a.jpeg``; the image URL
       carries the current time in milliseconds written in base 36.
    3. Prompt for user name, password and the verification code.
    4. POST the login form through ``opener``.

    The process exits with status 1 when the login page or the image cannot
    be fetched.  The cookie jar is printed at the end so the result of the
    login can be inspected.
    """
    import sys
    import time

    import baseconvert

    link = Process(opener, 'http://passport.csdn.net/UserLogin.aspx?from=Passport.aspx', cook)
    if not link:
        print('error: link is None')
        sys.exit(1)
    content = link.read()

    # Extract the __VIEWSTATE field of the login form.
    strvalue = GetContent(content, 'name', '__VIEWSTATE')
    print(strvalue)

    # Build the parameter of the verification-code image URL.
    BASE36 = "0123456789abcdefghijklmnopqrstuvwxyz"
    BASE10 = "0123456789"
    astr = str(int(time.time() * 1000))
    s = baseconvert.baseconvert(astr, BASE10, BASE36)
    print(s)

    # Fetch the verification-code image; Process stores it as a.jpeg.
    link = Process(opener, 'http://passport.csdn.net/ShowExPwd.aspx?temp=' + s, cook, None, 'a.jpeg')
    if not link:
        print('error: link is None')
        sys.exit(1)

    # Ask for the account and the verification code.  The PyDev console
    # leaves a trailing '\r' on the input, so strip line-end characters.
    user = raw_input('username:').rstrip('\r\n')
    passwd = raw_input('password:').rstrip('\r\n')
    code = raw_input('verifycode:').rstrip('\r\n')

    # Form fields of the login POST.
    post_url = 'http://passport.csdn.net/UserLogin.aspx?from=/Passport.aspx'
    postdata = {
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': strvalue,
        'tb_LoginNameOrLoginEmail': user,
        'tb_Password': passwd,
        # NOTE(review): the earlier code looked up the 'ClientKey' cookie
        # but never sent it; the field is still posted empty.  Confirm
        # whether the server expects the cookie value here.
        'ClientKey': '',
        'tb_ExPwd': code,
        'from': post_url,
        'MailParameters': 'from=' + post_url,
        'Image_Login.x': '0',
        'Image_Login.y': '0',
    }

    # Special case: this request is built by hand instead of via Process.
    req = urllib2.Request(post_url)
    cook.add_cookie_header(req)  # fill in the cookie header
    req.add_header('User-Agent', 'Not IE 1.0 + Say Hello')
    req.add_data(urllib.urlencode(postdata))

    # Opening the request performs the POST; the body is not needed.
    link = opener.open(req)
    link.close()

    # If the login went through, the cookie jar now holds the account.
    print(cook)

def PostBlog(title, content, tags):
    """Publish one article through the CSDN blog editor form.

    Uses the module-level ``opener`` and ``cook``.  The editor page is
    fetched first to obtain ``__VIEWSTATE`` and ``__EVENTVALIDATION``, then
    the filled-in form is posted back to the same URL and the response is
    stored in ``last.html``.

    Args:
        title: Article title; ``[z]`` is appended to it.
        content: Article body (HTML).  Its first 500 characters followed by
            ``...`` are sent as the excerpt.
        tags: Comma-separated tag string.

    Returns:
        None.  ``send ok`` or ``send wrong`` is printed depending on whether
        the POST produced a response; nothing is printed when the editor
        page itself cannot be fetched.
    """
    strlink = 'http://writeblog.csdn.net/PostEdit.aspx'
    link = Process(opener, strlink, cook, None, None)
    if not link:
        return
    html = link.read()

    view_state = GetContent(html, 'name', '__VIEWSTATE')
    event_validate = GetContent(html, 'name', '__EVENTVALIDATION')

    # Form fields to send.
    editor = 'ctl00$ContentPlaceHolder1$EntryEditor1$'
    data = {
        '__LASTFOCUS': '',
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': view_state,
        editor + 'txbTitle': title + '[z]',
        editor + 'richTextEditor$richTextEditor': content,
        editor + 'txbTags': tags,
        editor + 'txbExcerpt': content[:500] + '...',
        editor + 'SaveButton': '发表文章',
        editor + 'rblOri': 'copy',
        editor + 'GlobalCategoryList': '',
        editor + 'ckbPublished': 'on',
        editor + 'chkDisplayHomePage': 'on',
        editor + 'chkIsAggregated': 'on',
        editor + 'chkCopytoClipboard': 'on',
        '__EVENTVALIDATION': event_validate,
    }

    link = Process(opener, strlink, cook, data, 'last.html')
    if link:
        print('send ok')
    else:
        print('send wrong')

def GetAndPostArticle(links, type=2, tags=None):
    """Fetch each article in ``links`` and republish it with PostBlog.

    For every URL the page is downloaded, the title is taken from the
    ``<div class="postTitle">`` element and the body from the
    ``<div class="postText">`` element, and both are handed to
    ``PostBlog``.  A failure on one article is printed and the loop moves
    on to the next one.

    Args:
        links: Iterable of article URLs.  When empty or ``None`` a message
            is printed and nothing else happens.
        type: Unused; kept so existing calls keep working.
        tags: Comma-separated tag string passed to ``PostBlog``.  Defaults
            to the tag list this tool has always used.

    Returns:
        None.
    """
    if not links:
        print('No url link in links parameter')
        return

    if tags is None:
        tags = 'Linux,技术,IT,反汇编,调试,内存泄露,Win32,项目'

    for link in links:
        # Best effort per article: report the failure and carry on.
        try:
            htm = urllib.urlopen(link).read()
            soup = BeautifulSoup.BeautifulSoup(htm)

            title_div = soup.find('div', attrs={'class': 'postTitle'})
            body_div = soup.find('div', attrs={'class': 'postText'})
            # Without this check a missing div would be posted as the text
            # of str(None).
            if title_div is None or body_div is None:
                raise ValueError('postTitle/postText div not found')

            # The title text is the second child of the title link.
            # NOTE(review): the replaced literal is a single space once the
            # paste padding is removed; the page source may have used
            # '&nbsp;' here -- confirm against a real article page.
            title = str(title_div.a.contents[1]).replace(' ', '')

            print('----')

            # Inner HTML of the body div, without the enclosing <div> tags.
            content = body_div.renderContents()

            PostBlog(title, content, tags)
            # PostBlog reports its own send result; it returns nothing to
            # check here.
            print('Article %s Success! ^_^' % link)
        except Exception as x:
            print('Article %s Failed!!!!!! %s .....' % (link, x))

####### main entry ##########

import sys

# Python 2 only: setdefaultencoding is removed from sys at start-up, and
# reload(sys) brings it back.  The default str/unicode codec is switched to
# gb2312 for the rest of the process.
reload(sys)
sys.setdefaultencoding('gb2312')

# Global state shared by Process, GetPass and PostBlog.

# HTTP handler that carries the debug setting; set debuglevel to 1 to trace
# the HTTP traffic.
h = urllib2.HTTPHandler(debuglevel=0)

# Cookie jar that keeps the login session.
cook = cookielib.CookieJar()
cookie = urllib2.HTTPCookieProcessor(cook)

# Opener used for every request.
opener = urllib2.build_opener(cookie, h)

GetPass()

# These links are only an example.
links = [
    'http://blog.csdn.net/thomasliu83/articles/760339.aspx',
    'http://blog.csdn.net/thomasliu83/articles/740153.aspx',
    'http://blog.csdn.net/thomasliu83/articles/222986.aspx',
    'http://blog.csdn.net/thomasliu83/articles/222975.aspx',
]

GetAndPostArticle(links, 1)