初次写 Python 应用:一个比较简单的文件编码转换工具,用来将文件转换到指定编码。主要利用了 open 文件操作、os 目录遍历和 chardet 编码探测,解决 movist(多字幕播放器)只能正确读取 utf-8 字幕文件的问题,一次将目录下所有字幕都转换成 utf-8 编码。
ps:发现使用多线程后,时间反而会上升一倍,看来对于小任务线程还是开销比较大的
/Users/yiminghe/code/python/tools/encode.py :
# -*- coding: utf-8 -*-
import sys,os,shutil,traceback,time
from chardet.universaldetector import UniversalDetector
# Chinese charsets: alias table consulted by gb(). Keys are lower-case
# encoding names, values are the codec name that is used in their place.
encodes = dict.fromkeys(("gb2312", "gbk"), "gb18030")
class HeEncodingEx(Exception):
    '''
    Error raised by this script when an encoding name is unusable.
    @param msg{string}: human readable description of the problem
    '''

    def __init__(self, msg):
        super(HeEncodingEx, self).__init__(msg)
def gb(encoding):
    '''
    normalise an encoding name to the codec name used for decoding
    @param encoding{string}: encoding name, or None when none is known
    @return{string}: the stripped, lower-cased name; names listed in
        ``encodes`` are replaced by the codec they map to
    @raise HeEncodingEx: when ``encoding`` is None
    '''
    if encoding is None:
        raise HeEncodingEx("unknown encoding")
    encoding = encoding.strip().lower()
    # Fall back to the name itself when no alias is registered for it.
    return encodes.get(encoding, encoding)
def transferToEncoding(filename,toCode):
    '''
    save the content of filename to filename with toCode text encoding

    The source encoding is detected with chardet's UniversalDetector and
    normalised through gb(). When it differs from toCode, a one-time backup
    ``filename + ".bak"`` is created (an existing backup is never
    overwritten) and the file is rewritten in place.

    @param filename{string}: text file
    @param toCode{string}: text encoding code ,gbk,utf-8...etc
    @return{boolean}: True when the file already was in toCode or has been
        converted; False when filename is a directory or any step failed
        (the traceback is printed and the file keeps its previous bytes)
    '''
    if os.path.isdir(filename):
        print("error:not file")
        return False
    backup = filename + ".bak"
    # Set just before the target is opened for writing, so that a failure
    # before that point never touches the file.
    truncated = False
    try:
        # Binary mode: the detector and the codecs need the untouched bytes.
        with open(filename, "rb") as src:
            raw = src.read()
        #detect encoding
        detector = UniversalDetector()
        for chunk in raw.splitlines(True):
            detector.feed(chunk)
            if detector.done:
                break
        detector.close()
        encode = gb(detector.result['encoding'])
        if encode.lower() != toCode.lower():
            # Convert the whole content in one go and *before* the file is
            # opened for writing: a decode/encode error (or an unknown
            # toCode) then leaves the file untouched, and multi-byte
            # encodings are not cut at line boundaries.
            converted = raw.decode(encode).encode(toCode)
            #backup original file
            if not os.path.exists(backup):
                shutil.copy(filename, backup)
            #save to another encoding
            truncated = True
            with open(filename, "wb") as dst:
                dst.write(converted)
    except Exception:
        traceback.print_exc()
        #restore
        if truncated:
            # Put back exactly the bytes read above; a pre-existing .bak may
            # stem from an earlier run and hold different content.
            with open(filename, "wb") as dst:
                dst.write(raw)
        return False
    finally:
        # Two blank lines separate the output of consecutive files.
        print("\n")
    return True
#main
def collectSubtitles(root):
    '''
    collect the subtitle files below a folder
    @param root{string}: folder that is walked recursively
    @return{Array}: paths of all files whose name ends with ".srt"
        (case-insensitive)
    '''
    found = []
    for base, _folders, names in os.walk(root):
        for name in names:
            # Match the extension including its dot so that names which
            # merely end in the letters "srt" are not picked up.
            if name.lower().endswith(".srt"):
                found.append(os.path.join(base, name))
    return found


def processInThreads(files, func, toCode, threadNum):
    '''
    run func(file, toCode) once for every element of files on worker threads
    @param files{Array}: paths to process (the list itself is not modified)
    @param func{Function}: handler called as func(file, toCode)
    @param toCode{string}: target encoding handed to func
    @param threadNum{int}: maximum number of worker threads
    '''
    import threading
    pending = list(files)
    lock = threading.Lock()

    def fetchAndProcess():
        # Take one element at a time. The lock is held only while the shared
        # list is touched and is released on every exit path ("with"), so a
        # worker that finds the list empty cannot block the others.
        while True:
            with lock:
                if not pending:
                    return
                file_ = pending.pop()
                print("%s  got :  %s" % (threading.current_thread().ident, file_))
            func(file_, toCode)

    # No point in starting more workers than there are files.
    workers = [threading.Thread(target=fetchAndProcess)
               for _ in range(min(threadNum, len(pending)))]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


if __name__ == "__main__":
    start = time.time()
    if len(sys.argv) < 2:
        print("erro argv! filename toCoding")
        sys.exit(1)
    #default transfer to utf-8
    toCode = sys.argv[2] if len(sys.argv) > 2 else "utf-8"
    filename = sys.argv[1]
    if os.path.isfile(filename):
        transferToEncoding(filename, toCode)
    else:
        #folder? then walk; up to 10 threads process the files concurrently
        THREAD_NUM = 10
        processInThreads(collectSubtitles(filename), transferToEncoding,
                         toCode, THREAD_NUM)
    #author's measurement: 10, 20 or 30 threads 40.6 s, single thread 28.1 s
    print("consume time : %s" % (time.time() - start))
/Users/yiminghe/code/python/tools/he_encode.sh:
#!/bin/bash -
# Forward the arguments with "$@" (not $*, $@ or "$*") so that each one is
# passed on as its own word, even when it contains spaces.
python /Users/yiminghe/code/python/tools/encode.py "$@"
创建软链接
chmod 777 /Users/yiminghe/code/python/tools/he_encode.sh
ln -s /Users/yiminghe/code/python/tools/he_encode.sh /usr/bin/he_encode
运行:
定位到某个目录下运行即可:将该目录下的所有字幕文件转换为 utf-8 格式
he_encode .