# Python3批量转换文本文件编码 (Python 3: batch-convert the encoding of text files)
002 | try : |
003 | from chardet.universaldetector import UniversalDetector |
004 | IsAuto = True |
005 | except ImportError: |
006 | IsAuto = False |
007 | import os |
008 | import os.path |
009 | import glob |
010 |
def Convert_Auto(filename, out_enc="utf-8"):
    '''Re-encode a text file in place, auto-detecting its current encoding.

    Needs the chardet library (UniversalDetector).
    Input Parameter:
        filename: full path and file name, e.g. C:/dir1/file.txt
                  The file is overwritten with the converted content.
        out_enc: new encoding. Default as 'utf-8'
    Output Parameter
        None. One "Success"/"Error" line is printed per call.
    '''
    # Bound before the try so the error message can always be built, even
    # when the failure happens before detection has produced a result.
    in_enc = None
    try:
        with open(filename, 'rb') as f:
            raw = f.read()

        # Detection only samples the first 1024 bytes of the file.
        detector = UniversalDetector()
        detector.reset()
        detector.feed(raw[:1024])
        detector.close()
        in_enc = detector.result['encoding']
        if in_enc is None:
            # The detector reported no encoding; decoding with None would
            # raise TypeError, so treat it as a failed conversion instead.
            raise LookupError("source encoding could not be detected")

        # Bytes that cannot be decoded are dropped ('ignore').
        # Encode in memory BEFORE reopening the file: if encoding fails the
        # original file has not been truncated yet. Writing bytes also keeps
        # the original line endings untouched on every platform.
        encoded = raw.decode(in_enc, 'ignore').encode(out_enc)
        with open(filename, 'wb') as f:
            f.write(encoded)
    except (IOError, LookupError, UnicodeError):
        print("Error: " + filename + " FAIL to converted from "
              + (in_enc or "unknown") + " to " + out_enc + " !")
    else:
        print("Success: " + filename + " converted from " + in_enc
              + " to " + out_enc + " !")
037 |
def Convert_Manu(filename, in_enc='gbk', out_enc="utf-8"):
    '''Re-encode a text file in place, with the input encoding given by the caller.

    Input Parameter:
        filename: full path and file name, e.g. C:/dir1/file.txt
                  The file is overwritten with the converted content.
        in_enc: current encoding. Default as 'gbk'
        out_enc: new encoding. Default as 'utf-8'
    Output Parameter
        None. One "Success"/"Error" line is printed per call.
    '''
    try:
        print("convert " + filename)
        with open(filename, 'rb') as f:
            raw = f.read()

        # Bytes that cannot be decoded as in_enc are dropped ('ignore').
        # Encode in memory BEFORE reopening the file: if out_enc cannot
        # represent the text (or is not a known codec) the original file is
        # still intact. Writing bytes also avoids newline translation.
        encoded = raw.decode(in_enc, 'ignore').encode(out_enc)
        with open(filename, 'wb') as f:
            f.write(encoded)
    except (IOError, LookupError, UnicodeError):
        print("Error: " + filename + " FAIL to converted from " + in_enc
              + " to " + out_enc + " !")
    else:
        print("Success: " + filename + " converted from " + in_enc
              + " to " + out_enc + " !")
058 |
059 |
def explore(dir, IsLoopSubDIR=True):
    '''Convert the encoding of every '.txt' file found for a folder.

    Input:
        dir          : Current folder
        IsLoopSubDIR : True  -- collect files with getSubFileList
                               (sub folders included)
                       False -- collect files with getCurrFileList
                               (current level only)
    Output:
        NONE
    '''
    # Pick the collector once; both take (folder, suffix).
    collect = getSubFileList if IsLoopSubDIR else getCurrFileList

    for path in collect(dir, '.txt'):
        if not IsAuto:
            # chardet was not importable: assume GBK input.
            Convert_Manu(path, 'gbk', 'utf-8')
            continue
        Convert_Auto(path, 'utf-8')
078 |
079 | |
def getSubFileList(dir, suffix=''):
    '''Get all files with the specified suffix under a folder (sub folders included).

    Input:
        dir    : Folder to walk. (Previously ignored in favour of the
                 process working directory; it is now honoured.)
        suffix : default to blank, means select all files.
    Output:
        List of paths, each built as os.path.join(<walked folder>, <name>).
    '''
    return [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(dir)
        for name in files
        if name.endswith(suffix)
    ]
094 |
def getCurrFileList(dir, suffix=''):
    '''Get all files with the specified suffix directly inside a folder.

    Input:
        dir    : Folder to list. (Previously ignored in favour of the
                 process working directory; it is now honoured.)
        suffix : default to blank, means select all files.
    Output:
        List of paths of the form os.path.join(dir, <name>). Sub folders
        are not descended into, and directories whose name happens to
        match are left out so that only files are returned.
    '''
    # '*' + '' is just '*', so the blank suffix needs no special case.
    # glob.escape keeps characters such as '[' in the folder name literal.
    pattern = os.path.join(glob.escape(dir), '*' + suffix)
    return [path for path in glob.glob(pattern) if os.path.isfile(path)]
111 | |
112 | |
def main():
    '''Convert the '.txt' files under the working directory, sub folders included.'''
    start_folder = os.getcwd()
    explore(start_folder, IsLoopSubDIR=True)


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()