#encoding=utf8
import chardet
def str_detect(str):
    """Dump a string as hex and print chardet's guess at its encoding.

    Prints, in order:
      1. every element of ``str`` as colon-separated hex (two digits minimum),
      2. the result dict returned by ``chardet.detect(str)``,
      3. ``str`` itself, but only when the guessed encoding is ``"utf-8"``.

    If any of those steps raises, the type of ``str`` is printed instead.
    An empty line is always printed last as a separator.

    Args:
        str: the value to inspect.  The name shadows the builtin; it is kept
            unchanged so existing callers are unaffected.
    """
    # Single-argument print(...) behaves the same as the print statement.
    try:
        print(":".join("{:02x}".format(ord(c)) for c in str))
        guess = chardet.detect(str)
        print(guess)
        if guess['encoding'] == "utf-8":
            print(str)
    except Exception:
        # Best-effort diagnostic: report the offending type and keep going.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        print(type(str))
    print("")
def enc(str, enc):
    """Encode ``str`` with the codec ``enc`` and inspect the result.

    On success the encoded value is handed to ``str_detect``.  If the
    ``encode`` call raises, ``ERR:encode`` is printed and nothing else
    happens.

    Args:
        str: the object whose ``encode`` method is called.
        enc: the codec name passed to ``encode``.
    """
    try:
        encoded = str.encode(enc)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        print("ERR:encode")
    else:
        # Kept outside the try so that only a failure of the encode call
        # itself is reported as an encode error.
        str_detect(encoded)
def dec(str, enc):
    """Decode ``str`` with the codec ``enc`` and inspect the result.

    On success the decoded value is handed to ``str_detect``.  If the
    ``decode`` call raises, ``ERR:decode`` is printed and nothing else
    happens.

    Args:
        str: the object whose ``decode`` method is called.
        enc: the codec name passed to ``decode``.
    """
    try:
        decoded = str.decode(enc)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        print("ERR:decode")
    else:
        # Kept outside the try so that only a failure of the decode call
        # itself is reported as a decode error.
        str_detect(decoded)
# Demo driver: runs the helpers above on a few sample strings.
print("=============================")

# Plain (non-u) literal.
cn = "中文"
str_detect(cn)

# u"" literal: inspect it, then try to encode it with each codec in turn.
cn1 = u"中文1"
str_detect(cn1)
for codec in ('utf-8', 'utf-16', 'gb2312', "ISO-8859-1"):
    enc(cn1, codec)

# Plain literal again: inspect it and decode it as UTF-8 ...
cn2 = "中文2"
str_detect(cn2)
dec(cn2, 'utf-8')
# ... then call decode on the u"" value with the remaining codecs.
for codec in ('utf-16', 'gb2312', "ISO-8859-1"):
    dec(cn1, codec)

# Round trip: decode from UTF-8, encode to UTF-16, decode from UTF-16,
# inspecting the value after every step.
cn3 = "中文3"
dec_str = cn3.decode('utf-8')
str_detect(dec_str)
enc_str = dec_str.encode('utf-16')
str_detect(enc_str)
end_str = enc_str.decode('utf-16')
str_detect(end_str)
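# decode: a byte string (str) is decoded, according to the encoding its bytes are in, into a unicode string.
# encode: a unicode string is converted into a byte string (str) in the specified encoding.
# Note: results may differ from system to system.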
# Result (sample output of one run):
# =============================
# e4:b8:ad:e6:96:87
# {'confidence': 0.7525, 'language': '', 'encoding': 'utf-8'}
# 中文
# 4e2d:6587:31
# <type 'unicode'>
# e4:b8:ad:e6:96:87:31
# {'confidence': 0.7525, 'language': '', 'encoding': 'utf-8'}
# 中文1
# ff:fe:2d:4e:87:65:31:00
# {'confidence': 1.0, 'language': '', 'encoding': 'UTF-16'}
# d6:d0:ce:c4:31
# {'confidence': 0.682639754276994, 'language': 'Russian', 'encoding': 'KOI8-R'}
# ERR:encode
# e4:b8:ad:e6:96:87:32
# {'confidence': 0.7525, 'language': '', 'encoding': 'utf-8'}
# 中文2
# 4e2d:6587:32
# <type 'unicode'>
# ERR:decode
# ERR:decode
# ERR:decode
# 4e2d:6587:33
# <type 'unicode'>
# ff:fe:2d:4e:87:65:33:00
# {'confidence': 1.0, 'language': '', 'encoding': 'UTF-16'}
# 4e2d:6587:33
# <type 'unicode'>