Python 处理各种编码的字符串

79 篇文章 0 订阅


# file: Unicode2.py
# -*- coding: utf-8 -*-
 
import chilkat
 
# Demonstrates how chilkat.CkString converts one piece of text between
# character encodings and how its length differs in each of them.
#
# Every print call below takes exactly one parenthesised argument, so each
# line is valid both as a Python 2 print statement and as a Python 3
# print() call.
#
# NOTE(review): the byte counts quoted in the comments assume the getters
# return byte strings, which is how the original Python 2 script observed
# them -- confirm before relying on those numbers under Python 3.

# The CkString object can handle any character encoding.
s1 = chilkat.CkString()
# The appendEnc method allows us to append a string in any encoding.
s1.appendEnc('èéêëabc', 'utf-8')

# If you're working with different encodings, you may wish
# to name your string variables to reflect the encoding.
strAnsi = s1.getAnsi()
strUtf8 = s1.getUtf8()

# Prints "7": seven characters at one byte each
# (assumes a single-byte Western ANSI code page).
print(len(strAnsi))
# Prints "11": the four accented letters take two bytes each in utf-8,
# the three us-ascii letters take one byte each (4 * 2 + 3).
print(len(strUtf8))

# getNumChars returns the number of characters
print('Num Chars: ' + str(s1.getNumChars()))

# utf-8 chars do not have a constant number of bytes/char.
# A single utf-8 char is represented in 1 to 4 bytes (RFC 3629; the
# 5- and 6-byte forms of the original utf-8 design are no longer valid).
print('utf-8: ' + str(s1.getSizeUtf8()))

# ANSI is typically 1 byte per char, but for some languages
# such as Japanese, ANSI equates to a character encoding that may
# not be 1 byte/char.  (Shift_JIS is the ANSI encoding for Japanese)
print('ANSI: ' + str(s1.getSizeAnsi()))

# Let's create an English/Japanese string.
s2 = chilkat.CkString()
s2.appendEnc('abc愛知県新城市の', 'utf-8')

# We can get the string in any multibyte encoding.
print('s2 num chars = ' + str(s2.getNumChars()))

strShiftJIS = s2.getEnc('shift_JIS')
print('Shift-JIS num bytes = ' + str(len(strShiftJIS)))

strIso2022JP = s2.getEnc('iso-2022-jp')
print('iso-2022-jp num bytes = ' + str(len(strIso2022JP)))

strEucJp = s2.getEnc('euc-jp')
print('euc-jp num bytes = ' + str(len(strEucJp)))

# We can save the string in any encoding
s2.saveToFile('out_shift_jis.txt', 'shift_JIS')
s2.saveToFile('out_iso_2022_jp.txt', 'iso-2022-jp')
s2.saveToFile('out_utf8.txt', 'utf-8')
s2.saveToFile('out_euc_jp.txt', 'euc-jp')

# You may mix any number of languages in a utf-8 string
# because utf-8 can encode characters in all languages.
# (utf-8 is the multi-byte encoding of Unicode)
#
# An ANSI string can generally hold us-ascii + the native language.
# For example, Shift_JIS can represent us-ascii characters
# in addition to Japanese characters.
# For example, this is OK
strShiftJis = 'abc123' + s2.getEnc('shift_JIS')

# This is not OK: the Greek literal is not us-ascii, so it is stored in
# the source file's own encoding rather than in Shift_JIS, and the
# concatenation glues two different encodings into one string.
# It is kept on purpose as the counter-example.
strShiftJis2 = 'ςστυφ' + s2.getEnc('shift_JIS')

print("Done!")
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值