[Repost] Converting GBK-encoded files to UTF-8 with Python

Reference material:

字符集与字符编码 (Character Sets and Character Encodings) - 网易云课堂, study.163.com

Sometimes a web project needs JavaScript to read text files in order to load data. These text files come in many encodings, and files containing Chinese characters are often GBK-encoded, so they show up as mojibake after loading. They therefore need to be converted to the internationally compatible UTF-8 encoding before being loaded. Garbled text is an annoying problem; after searching for quite a while, I finally found a workable solution: a Python program that batch-converts either a single file or every matching file under a directory. It has been tested on real data and works. The file extension it looks for is configurable; in this article it is set to "cfg", and you can change it inside the program to suit your project. The full program is listed below.
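Before the full listing, here is a minimal sketch of the core idea, added here purely as an illustration (the function name and the file name data.cfg are placeholders, not part of the original script): detect the file's encoding with chardet, decode the raw bytes, and write the text back out as UTF-8.

    # Minimal, illustrative sketch only; the full, more robust script follows.
    import chardet

    def to_utf8(path):
        with open(path, 'rb') as f:
            raw = f.read()
        detected = chardet.detect(raw)            # e.g. {'encoding': 'GB2312', 'confidence': 0.99, ...}
        src_enc = detected['encoding'] or 'utf-8'
        if src_enc.lower() == 'gb2312':           # widen to the GB18030 superset, as the full script does
            src_enc = 'gb18030'
        text = raw.decode(src_enc)
        with open(path, 'w', encoding='utf-8') as f:   # overwrites in place; the full script makes backups
            f.write(text)

    to_utf8('data.cfg')   # 'data.cfg' is a placeholder file name

The complete script (gbk2utf.py) adds encoding detection thresholds, backups, BOM handling, and a command-line interface: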

gbk2utf.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = ''

import logging, os, argparse, textwrap
import time

import chardet

# Default configuration will take effect when corresponding input args are missing.
# Feel free to change this for your convenience.
DEFAULT_CONF = {
    # Only those files ending with extensions in this list will be scanned or converted.
    'exts'        : ['cfg'],
    'overwrite'   : False,
    'add_BOM'     : False,
    'convert_UTF' : False,
    'confi_thres' : 0.8,
}
# We have to set a minimum confidence threshold. Only encoding results returned by chardet
# that are above this threshold will be accepted.
# See https://github.com/x1angli/convert2utf/issues/4 for further details.

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
log = logging.getLogger(__name__)


class Convert2Utf8:

    def __init__(self, args):
        self.args = args

    def walk_dir(self, dirname):
        for root, dirs, files in os.walk(dirname):
            for name in files:
                # On Linux a trailing newline would make the match fail, so we 'strip()' it.
                # 'lower()' makes the extension comparison case-insensitive.
                extension = os.path.splitext(name)[1][1:].strip().lower()
                if extension in self.args.exts:
                    fullname = os.path.join(root, name)
                    try:
                        self.convert_file(fullname)
                    except IOError:
                        log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
                    except KeyboardInterrupt:
                        log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
                        exit()
                    # else:
                    #     log.error("Unable to process the file: %s. Please check.", fullname)
                    #     traceback.print_stack()

    def convert_file(self, filename):
        with open(filename, 'rb') as f:  # read in binary mode
            bytedata = f.read()

        if len(bytedata) == 0:
            log.info("Skipped empty file %s", filename)
            return

        chr_res = chardet.detect(bytedata)
        if chr_res['encoding'] is None or chr_res['confidence'] < DEFAULT_CONF['confi_thres']:
            log.warning("Ignoring %s, since its encoding cannot be detected reliably.", filename)
            return

        src_enc = chr_res['encoding'].lower()
        log.debug("Scanned %s, whose encoding is %s", filename, src_enc)

        if src_enc == 'ascii':
            log.info("Skipped %s, whose encoding is %s", filename, src_enc)
            return

        if (not self.args.convert_utf) and src_enc.startswith('utf'):
            log.info("Skipped %s, whose encoding is %s", filename, src_enc)
            return

        # Since chardet reports all GB-based encodings as 'gb2312', decoding would fail when the
        # text file contains certain special characters. To make it more special-character-tolerant,
        # we upgrade the source encoding to 'gb18030', a character set larger than gb2312.
        if src_enc == 'gb2312':
            src_enc = 'gb18030'

        try:
            strdata = bytedata.decode(src_enc)
        except UnicodeDecodeError as e:
            log.error("Unicode error for file %s", filename)
            print(e)
            return

        # preserve the file's time information (modification time and access time)
        src_stat = os.stat(filename)

        # if the 'overwrite' flag is False, make a backup of the original text file
        if not self.args.overwrite:
            backup_name = filename + '.' + str(int(round(time.time() * 1000))) + '.bak'
            log.info("Renaming %s to %s", filename, backup_name)
            os.rename(filename, backup_name)

        tgt_enc = self.args.target_encoding
        log.debug("Writing the file: %s in %s", filename, tgt_enc)
        with open(filename, 'wb') as f:  # write in binary mode
            f.write(strdata.encode(tgt_enc))
        log.info("Converted the file: %s from %s to %s", filename, src_enc, tgt_enc)

        # set the new file's timestamps to those of the old file
        os.utime(filename, times=(src_stat.st_atime, src_stat.st_mtime))

    def run(self):
        root = self.args.root
        if not os.path.exists(root):
            log.error("The file specified %s is neither a directory nor a regular file", root)
            return

        log.info("Start working now!")
        if os.path.isdir(root):
            log.info("The root is: %s.", root)
            log.info("Files with these extension names will be inspected: %s", self.args.exts)
            self.walk_dir(root)
        else:
            log.info("Only a single file will be processed: %s", root)
            self.convert_file(root)
        log.info("Finished all.")


def clean_backups(dirname):
    if not os.path.isdir(dirname):
        log.error("The file specified %s is not a directory", dirname)
        return

    now = time.time()
    last40min = now - 60 * 40
    log.info("Removing all newly-created .bak files under %s", dirname)
    for root, dirs, files in os.walk(dirname):
        for name in files:
            extension = os.path.splitext(name)[1][1:]
            if extension == 'bak':
                fullname = os.path.join(root, name)
                ctime = os.path.getctime(fullname)
                if ctime > last40min:
                    os.remove(fullname)
                    log.info("Removed the file: %s", fullname)


def cli():
    parser = argparse.ArgumentParser(
        prog='cvt2utf8',
        description="A tool that converts non-UTF-encoded text files into UTF-8 encoded files.",
        epilog="You can use this tool to remove BOMs from .php source code files, or to convert files in other encodings into UTF-8.")

    parser.add_argument(
        'root',
        metavar='filename',
        help=textwrap.dedent('''
            the path pointing to the file or directory.
            If it is a directory, files in it with the specified extensions will be converted to UTF-8.
            Otherwise, if it is a file, only that file will be converted to UTF-8.''')
    )

    parser.add_argument(
        '-e',
        '--exts',
        nargs='+',  # like '*', all command-line args present are gathered into a list
        default=DEFAULT_CONF['exts'],
        help="the list of file extensions. Only files ending with extensions in this list will be converted.",
    )

    parser.add_argument(
        '-o',
        '--overwrite',
        action='store_true',
        default=DEFAULT_CONF['overwrite'],
        help="Danger! Turning this switch on overwrites existing files directly without creating any backups.",
    )

    parser.add_argument(
        '-u',
        '--cvtutf',
        action='store_true',
        dest='convert_utf',
        default=DEFAULT_CONF['convert_UTF'],
        help="By default, files whose encodings are already UTF (including UTF-8 and UTF-16) are skipped and their BOM headers remain unchanged. "
             "With this option, such files are rewritten as well, so their BOM signatures can be changed.",
    )

    parser.add_argument(
        '-b',
        '--addbom',
        action='store_true',
        dest='add_bom',
        default=DEFAULT_CONF['add_BOM'],
        help="Without this flag, files are converted to UTF-8 without a BOM (the target encoding is plain 'utf-8'). "
             "With this flag, a BOM is added to the converted files (the target encoding is 'utf-8-sig').",
    )

    parser.add_argument(
        '-c',
        '--cleanbak',
        action='store_true',
        dest='clean_bak',
        default=False,
        help=textwrap.dedent('''Clean all .bak files generated within the last 40 minutes.
            When enabled, no files will be converted to UTF-8. Use this flag with extra caution!'''),
    )

    args = parser.parse_args()

    if args.clean_bak:
        clean_backups(args.root)
    else:
        args.target_encoding = 'utf-8-sig' if args.add_bom else 'utf-8'
        cvt2utf8 = Convert2Utf8(args)
        cvt2utf8.run()


if __name__ == '__main__':
    cli()
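A note on the gb2312-to-gb18030 upgrade inside convert_file(): chardet tends to report every GB-family file as 'gb2312', but GBK/GB18030 contain characters that plain GB2312 does not (the character 镕 in 朱镕基 is a classic example), so decoding with the wider gb18030 codec avoids spurious failures. A small illustration, assuming CPython's standard codecs (not part of the original script):

    raw = '朱镕基'.encode('gbk')   # '镕' exists in GBK/GB18030 but not in plain GB2312
    print(raw.decode('gb18030'))   # works: 朱镕基 (GB18030 is a superset of GBK)
    raw.decode('gb2312')           # raises UnicodeDecodeError on the GBK-only character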

Execution:

(screenshot of the script being run; image not reproduced)
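Since the screenshot is not reproduced here, here are typical invocations based on the argument parser above (the paths are placeholders, not the ones used in the original run):

    # convert every .cfg file under ./web/data, keeping *.bak backups next to the originals
    python gbk2utf.py ./web/data

    # convert .cfg and .txt files, write UTF-8 with a BOM, and overwrite in place (no backups)
    python gbk2utf.py ./web/data -e cfg txt -b -o

    # afterwards, delete the .bak backups created within the last 40 minutes
    python gbk2utf.py ./web/data -c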

Result:

(screenshot of the converted output; image not reproduced)

If you found this article useful, please give it a like!
