[Repost] Converting GBK-encoded files to UTF-8 with Python

Reference material:

字符集与字符编码 (Character Sets and Character Encodings) - 网易云课堂, study.163.com

Sometimes a web project needs JavaScript to read text files in order to load data. These text files come in many encodings, and files containing Chinese characters are often GBK-encoded, so they show up as mojibake after loading. They therefore need to be converted to the internationally compatible UTF-8 encoding before being loaded. Garbled text is an annoying problem; after searching for quite a while, I finally found a workable solution: a Python program that batch-converts either a single file or every matching file under a directory. It has been tested on real data and works. The file extension it looks for is configurable; in this article it is set to "cfg", and you can change it inside the program to suit your project. The full program is listed below.
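Before the full listing, here is a minimal sketch of the core idea, added here purely as an illustration (the function name and the file name data.cfg are placeholders, not part of the original script): detect the file's encoding with chardet, decode the raw bytes, and write the text back out as UTF-8.

    # Minimal, illustrative sketch only; the full, more robust script follows.
    import chardet

    def to_utf8(path):
        with open(path, 'rb') as f:
            raw = f.read()
        detected = chardet.detect(raw)            # e.g. {'encoding': 'GB2312', 'confidence': 0.99, ...}
        src_enc = detected['encoding'] or 'utf-8'
        if src_enc.lower() == 'gb2312':           # widen to the GB18030 superset, as the full script does
            src_enc = 'gb18030'
        text = raw.decode(src_enc)
        with open(path, 'w', encoding='utf-8') as f:   # overwrites in place; the full script makes backups
            f.write(text)

    to_utf8('data.cfg')   # 'data.cfg' is a placeholder file name

The complete script (gbk2utf.py) adds encoding detection thresholds, backups, BOM handling, and a command-line interface: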

gbk2utf.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = ''

import logging, os, argparse, textwrap
import time

import chardet

# Default configuration will take effect when corresponding input args are missing.
# Feel free to change this for your convenience.
DEFAULT_CONF = {
    # Only those files ending with extensions in this list will be scanned or converted.
    'exts'        : ['cfg'],
    'overwrite'   : False,
    'add_BOM'     : False,
    'convert_UTF' : False,
    'confi_thres' : 0.8,
}
# We have to set a minimum confidence threshold. Only encoding results returned by chardet
# that are above this threshold will be accepted.
# See https://github.com/x1angli/convert2utf/issues/4 for further details.

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
log = logging.getLogger(__name__)


class Convert2Utf8:

    def __init__(self, args):
        self.args = args

    def walk_dir(self, dirname):
        for root, dirs, files in os.walk(dirname):
            for name in files:
                # On Linux a trailing newline would make the match fail, so we 'strip()' it.
                # 'lower()' makes the extension comparison case-insensitive.
                extension = os.path.splitext(name)[1][1:].strip().lower()
                if extension in self.args.exts:
                    fullname = os.path.join(root, name)
                    try:
                        self.convert_file(fullname)
                    except IOError:
                        log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
                    except KeyboardInterrupt:
                        log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
                        exit()
                    # else:
                    #     log.error("Unable to process the file: %s. Please check.", fullname)
                    #     traceback.print_stack()

    def convert_file(self, filename):
        with open(filename, 'rb') as f:  # read in binary mode
            bytedata = f.read()

        if len(bytedata) == 0:
            log.info("Skipped empty file %s", filename)
            return

        chr_res = chardet.detect(bytedata)
        if chr_res['encoding'] is None or chr_res['confidence'] < DEFAULT_CONF['confi_thres']:
            log.warning("Ignoring %s, since its encoding cannot be detected reliably.", filename)
            return

        src_enc = chr_res['encoding'].lower()
        log.debug("Scanned %s, whose encoding is %s", filename, src_enc)

        if src_enc == 'ascii':
            log.info("Skipped %s, whose encoding is %s", filename, src_enc)
            return

        if (not self.args.convert_utf) and src_enc.startswith('utf'):
            log.info("Skipped %s, whose encoding is %s", filename, src_enc)
            return

        # Since chardet reports all GB-based encodings as 'gb2312', decoding would fail when the
        # text file contains certain special characters. To make it more special-character-tolerant,
        # we upgrade the source encoding to 'gb18030', a character set larger than gb2312.
        if src_enc == 'gb2312':
            src_enc = 'gb18030'

        try:
            strdata = bytedata.decode(src_enc)
        except UnicodeDecodeError as e:
            log.error("Unicode error for file %s", filename)
            print(e)
            return

        # preserve the file's time information (modification time and access time)
        src_stat = os.stat(filename)

        # if the 'overwrite' flag is False, make a backup of the original text file
        if not self.args.overwrite:
            backup_name = filename + '.' + str(int(round(time.time() * 1000))) + '.bak'
            log.info("Renaming %s to %s", filename, backup_name)
            os.rename(filename, backup_name)

        tgt_enc = self.args.target_encoding
        log.debug("Writing the file: %s in %s", filename, tgt_enc)
        with open(filename, 'wb') as f:  # write in binary mode
            f.write(strdata.encode(tgt_enc))
        log.info("Converted the file: %s from %s to %s", filename, src_enc, tgt_enc)

        # set the new file's timestamps to those of the old file
        os.utime(filename, times=(src_stat.st_atime, src_stat.st_mtime))

    def run(self):
        root = self.args.root
        if not os.path.exists(root):
            log.error("The file specified %s is neither a directory nor a regular file", root)
            return

        log.info("Start working now!")
        if os.path.isdir(root):
            log.info("The root is: %s.", root)
            log.info("Files with these extension names will be inspected: %s", self.args.exts)
            self.walk_dir(root)
        else:
            log.info("Only a single file will be processed: %s", root)
            self.convert_file(root)
        log.info("Finished all.")


def clean_backups(dirname):
    if not os.path.isdir(dirname):
        log.error("The file specified %s is not a directory", dirname)
        return

    now = time.time()
    last40min = now - 60 * 40
    log.info("Removing all newly-created .bak files under %s", dirname)
    for root, dirs, files in os.walk(dirname):
        for name in files:
            extension = os.path.splitext(name)[1][1:]
            if extension == 'bak':
                fullname = os.path.join(root, name)
                ctime = os.path.getctime(fullname)
                if ctime > last40min:
                    os.remove(fullname)
                    log.info("Removed the file: %s", fullname)


def cli():
    parser = argparse.ArgumentParser(
        prog='cvt2utf8',
        description="A tool that converts non-UTF-encoded text files into UTF-8 encoded files.",
        epilog="You can use this tool to remove BOMs from .php source code files, or to convert files in other encodings into UTF-8.")

    parser.add_argument(
        'root',
        metavar='filename',
        help=textwrap.dedent('''
            the path pointing to the file or directory.
            If it is a directory, files in it with the specified extensions will be converted to UTF-8.
            Otherwise, if it is a file, only that file will be converted to UTF-8.''')
    )

    parser.add_argument(
        '-e',
        '--exts',
        nargs='+',  # like '*', all command-line args present are gathered into a list
        default=DEFAULT_CONF['exts'],
        help="the list of file extensions. Only files ending with extensions in this list will be converted.",
    )

    parser.add_argument(
        '-o',
        '--overwrite',
        action='store_true',
        default=DEFAULT_CONF['overwrite'],
        help="Danger! Turning this switch on overwrites existing files directly without creating any backups.",
    )

    parser.add_argument(
        '-u',
        '--cvtutf',
        action='store_true',
        dest='convert_utf',
        default=DEFAULT_CONF['convert_UTF'],
        help="By default, files whose encodings are already UTF (including UTF-8 and UTF-16) are skipped and their BOM headers remain unchanged. "
             "With this option, such files are rewritten as well, so their BOM signatures can be changed.",
    )

    parser.add_argument(
        '-b',
        '--addbom',
        action='store_true',
        dest='add_bom',
        default=DEFAULT_CONF['add_BOM'],
        help="Without this flag, files are converted to UTF-8 without a BOM (the target encoding is plain 'utf-8'). "
             "With this flag, a BOM is added to the converted files (the target encoding is 'utf-8-sig').",
    )

    parser.add_argument(
        '-c',
        '--cleanbak',
        action='store_true',
        dest='clean_bak',
        default=False,
        help=textwrap.dedent('''Clean all .bak files generated within the last 40 minutes.
            When enabled, no files will be converted to UTF-8. Use this flag with extra caution!'''),
    )

    args = parser.parse_args()

    if args.clean_bak:
        clean_backups(args.root)
    else:
        args.target_encoding = 'utf-8-sig' if args.add_bom else 'utf-8'
        cvt2utf8 = Convert2Utf8(args)
        cvt2utf8.run()


if __name__ == '__main__':
    cli()
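A note on the gb2312-to-gb18030 upgrade inside convert_file(): chardet tends to report every GB-family file as 'gb2312', but GBK/GB18030 contain characters that plain GB2312 does not (the character 镕 in 朱镕基 is a classic example), so decoding with the wider gb18030 codec avoids spurious failures. A small illustration, assuming CPython's standard codecs (not part of the original script):

    raw = '朱镕基'.encode('gbk')   # '镕' exists in GBK/GB18030 but not in plain GB2312
    print(raw.decode('gb18030'))   # works: 朱镕基 (GB18030 is a superset of GBK)
    raw.decode('gb2312')           # raises UnicodeDecodeError on the GBK-only character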

Execution:

(screenshot of the script being run; image not reproduced)
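Since the screenshot is not reproduced here, here are typical invocations based on the argument parser above (the paths are placeholders, not the ones used in the original run):

    # convert every .cfg file under ./web/data, keeping *.bak backups next to the originals
    python gbk2utf.py ./web/data

    # convert .cfg and .txt files, write UTF-8 with a BOM, and overwrite in place (no backups)
    python gbk2utf.py ./web/data -e cfg txt -b -o

    # afterwards, delete the .bak backups created within the last 40 minutes
    python gbk2utf.py ./web/data -c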

Result:

(screenshot of the converted output; image not reproduced)

If you found this article useful, please give it a like!
