在windows下做开发的中国程序员,很多平时都会遇到一些国外开发者写的C/C++代码,其中经常会包含一些非标准ANSI字符集的字符(不少优秀的类库、开源项目都会遇到),通常这些源代码文件都不是unicode编码的,导致在中文代码页的windows系统下VC编译器会出现大量警告,有些特殊字符出现在常量字符串定义中的,还会影响程序运行的结果。解决办法有两种:一种是将自己操作系统的字符代码页修改为与源代码文件相符合的代码页(一般修改为拉丁字符集的cp1252或者cp1250即可),这种办法的优点就是容易管理,比如代码作者更新了代码,你可以方便地比较修改;另一种办法就是将源码文件转换为一种unicode字符编码(通常是utf8)。本文的内容就是第二种方法。
之前写了个python脚本用来批量转换这些源代码文件为utf8编码保存,支持代码路径的嵌套,转换后的文件依然保持原始文件的目录结构。为了支持VC编译器,需要添加BOM头。您可以按照自己的需要对脚本做些修改,比如支持更多的文件后缀等,记录在此。
#!/usr/bin/env python
#coding:utf-8
# Author: Leo Cheng--<15483930@qq.com>
# Purpose:
# Created: 2011-7-28
import sys
import os
import codecs
"""
Usage: ConvertCp.py SrcDir DstDir
e.g.
if your source folder is "D:/Test/NonUnicode", destination folder is "D:/Test/utf8", just run command as follow :
python ConvertCp.py "D:/Test/NonUnicode" "D:/Test/utf8"
"""
codePageList = (\
"cp1252",
"cp1250",
"cp1251",
"cp1254",
"cp936",
"cp950",
"cp932",
"cp949",
"cp874",
"cp1253"
)
fileExtFilter = (\
".cpp",
".c",
".cxx",
".h",
".hpp",
".hxx"
)
def FileIsBomUtf8Encoding(filePath):
    """
    Return True if the file at *filePath* begins with the UTF-8 BOM.

    Only the first three bytes are read; the original implementation
    loaded the entire file just to inspect its first three bytes.
    Returns False when the file cannot be opened (a message is printed).

    @param filePath path of the file to inspect.
    """
    try:
        f = open(filePath, "rb")
        try:
            head = f.read(3)  # the UTF-8 BOM is exactly three bytes
        finally:
            f.close()
    except IOError:
        print("open file %s failed." % (os.path.basename(filePath)))
        return False
    # codecs.BOM_UTF8 == '\xef\xbb\xbf'; a shorter read never matches.
    return head == codecs.BOM_UTF8
#----------------------------------------------------------------------
def ConvertFileEncoding(sourceFilePath, targetFilePath, targetEncoding = "utf_8"):
    """
    Convert a text file from a legacy ANSI code page to *targetEncoding*.

    The source code page is guessed by trying every entry of
    ``codePageList`` in order and keeping the first one that decodes the
    whole file without error.  When the target encoding is a UTF-8
    variant a BOM is written so the Visual C++ editor recognises the
    file.  Files whose extension is not in ``fileExtFilter``, or that
    already carry a UTF-8 BOM, are skipped.

    @param sourceFilePath source file path.
    @param targetFilePath target file path (missing parent directories
           are created).
    @param targetEncoding codec name for the output file.
    @return True on success, False when the file was skipped or an
            error occurred.
    """
    #
    # Filter by file extension (case-insensitive).
    #
    filePathExt = os.path.splitext(sourceFilePath)[1]
    if filePathExt.lower() not in fileExtFilter:
        return False
    #
    # A file that already has a UTF-8 BOM needs no conversion.
    #
    if FileIsBomUtf8Encoding(sourceFilePath):
        print("File \"%s\" is utf_8 format, not need convert." % (sourceFilePath))
        return False
    #
    # Guess the source code page: first codec that decodes cleanly wins.
    # The source file is closed right after reading (the original kept
    # it open until the very end of the function).
    #
    content = None
    sourceEncoding = None
    for cp in codePageList:
        try:
            sourceFile = codecs.open(sourceFilePath, mode="r", encoding=cp)
            try:
                content = sourceFile.read()
            finally:
                sourceFile.close()
            sourceEncoding = cp
            break
        except UnicodeDecodeError:
            content = None
            continue
        except IOError:
            print("open file %s failed." % (os.path.basename(sourceFilePath)))
            return False
    if content is None:  # identity test instead of the original `== None`
        print("File \"%s\" is not valid encoding." % (sourceFilePath))
        return False
    #
    # Ensure the target directory exists.  dirname() returns "" for a
    # bare file name and makedirs("") raises, so guard against it.
    #
    targetPathDir = os.path.dirname(targetFilePath)
    if targetPathDir and not os.path.exists(targetPathDir):
        os.makedirs(targetPathDir)
    #
    # Write the converted content.
    #
    try:
        targetFile = codecs.open(targetFilePath, mode="w", encoding=targetEncoding)
        try:
            try:
                lowered = targetEncoding.lower()
                # Matches "utf8", "utf-8", "utf_8" (clearer than the
                # original index arithmetic on the last character).
                if lowered.startswith("utf") and lowered.endswith("8"):
                    # u'\ufeff' is the BOM code point; the codec encodes
                    # it to the proper byte sequence for the target.
                    targetFile.write(u'\ufeff')
                # A UTF-8 BOM decoded through a latin-like code page
                # shows up as exactly these three characters -- strip it
                # so we do not emit a second, bogus BOM byte sequence.
                if content[0:3] == u'\xef\xbb\xbf':
                    content = content[3:]
                targetFile.write(content)
            except UnicodeError:
                # Writing raises UnicodeEncodeError (a UnicodeError
                # subclass); the original caught UnicodeDecodeError,
                # which a write can never raise, so failures leaked.
                print("convert file: \"%s\" failure" % (sourceFilePath))
                targetFile.close()
                os.remove(targetFilePath)
                return False
        finally:
            targetFile.close()
    except IOError:
        print("open file %s failed." % (targetFilePath))
        return False
    print("convert file: \"%s\" from %s to %s successfully" % (os.path.basename(sourceFilePath), sourceEncoding, targetEncoding))
    return True
if __name__ == '__main__':
    # Command line: ConvertCp.py SrcDir DstDir.  Without both arguments
    # the usage text is printed and the built-in demo directories are
    # used instead.
    if len(sys.argv) <= 2:
        print(__doc__)
        sSourceDir = r"E:\TestSrc"
        sTargetDir = r"E:\TestDst"
    else:
        sSourceDir = sys.argv[1]
        sTargetDir = sys.argv[2]
    # Walk the source tree and mirror its structure under the target
    # directory.  os.path.relpath keeps only the path relative to the
    # source root; the original str.replace() would also rewrite any
    # LATER occurrence of the source-dir string inside a path, producing
    # a wrong target location.
    for root, dirs, files in os.walk(sSourceDir):
        for fileName in files:
            sourcePath = os.path.join(root, fileName)
            targetPath = os.path.join(sTargetDir, os.path.relpath(sourcePath, sSourceDir))
            ConvertFileEncoding(sourcePath, targetPath)