cbDumper.py转换为py3后的出错情况

最新推荐文章于 2020-11-23 13:52:10 发布

redstoneleo

最新推荐文章于 2020-11-23 13:52:10 发布

阅读量221

点赞数

Python 专栏收录该内容

48 篇文章 0 订阅

订阅专栏

运行2to3转换成py3的代码

D:\py I:\Users\i\AppData\Local\Programs\Python\Python36-32\Tools\scripts\2to3.py -w C:\Users\i\Downloads\cbDumper\cbDumper.py

转换后修改了第15行的代码后的cbDumper.py的内容如下

#This Python file uses the following encoding: utf-8
#Author:Superfan
#Version 1.0
#Latest Update:Aug,08,2011
#Note:This script will convert kingsoft ciba dictionary "dic" file to Mdict source, run it with python VM.
#Usage     :python cbDumper.py [output path] [dictionary path]
#Example   :python cbDumper.py "C:\Output" "C:\Dict\"
#This script is for STUDY use ONLY.DO NOT USE IT FOR ANY COMMERCIAL PURPOSE!!!

import zlib,struct,os,re,sys,time
import xml.parsers.expat

def printLog(str,onlyLog=False):
	global flog
	flog.write(str.encode('utf8')+b'\x0d\x0a')
	flog.flush()
	if onlyLog:return
	print(str)
def getInfo(fileName,dictDir,outDir):
	indexBeginPtr=0x54
	infoBegin=0x78
	#extract info file
	fileSubName=fileName[0:-4]
	fin=open(os.path.join(dictDir,fileName),'rb')
	fin.seek(indexBeginPtr)
	indexBegin=struct.unpack('<I',fin.read(4))[0]
	fin.seek(infoBegin)
	strTmp=str(fin.read(indexBegin-infoBegin),'utf16').encode('utf8')
	i=-1
	while strTmp[i]=='\x00':
		i+=-1
	if i<-1:strTmp=strTmp[:i]
	#~ open(os.path.join(outDir,fileSubName+'_info.xml'),'wb').write(strTmp)
	dicName=re.findall(r'<name>(.*)<\/name>',str(strTmp,'utf8'))[-1]
	dicCount=re.findall(r'<item_count>(.*)<\/item_count>',str(strTmp,'utf8'))[0]
	return dicName,dicCount
	
def decompress(fileName,dictDir,outDir):
	indexBeginPtr=0x54
	sizeIndexPtr=0x58
	infoBegin=0x78
	garbageSize=0
	buffer=''
	offsets=[]
	fileSubName=fileName[0:-4]
	fin=open(os.path.join(dictDir,fileName),'rb')
	fin.seek(indexBeginPtr)
	indexBegin=struct.unpack('<I',fin.read(4))[0]
	#get sizes
	fin.seek(sizeIndexPtr)
	sizeIndex=struct.unpack('<I4xI4xI4x',fin.read(24))
	for size in sizeIndex:
		garbageSize+=size
	#~ print garbageSize
	#get index
	fin.seek(indexBegin)
	offset=struct.unpack('<I',fin.read(4))[0]
	while offset<=0xFFFF:
		offsets.append(offset)
		offset=struct.unpack('<I',fin.read(4))[0]
	fin.seek(fin.tell()-4)
	#begin decompressing data
	finSize=os.path.getsize(os.path.join(dictDir,fileName))
	fout=open(os.path.join(outDir,'%s_raw.data'%fileSubName),'wb')
	garbagePassed=False
	printLog('Decompressing %s'%(fileName),True)
	for offset in offsets:
		if garbagePassed==False:
			buffer=zlib.decompress(fin.read(offset))
			print('Decompressing %s...%%%.2f\r'%(fileName,fin.tell()/finSize*100), end=' ')
			if len(buffer)>garbageSize:
				fout.write(buffer[garbageSize:])
				garbagePassed=True
			else:
				garbageSize-=len(buffer)
		else:
			fout.write(zlib.decompress(fin.read(offset)))
			print('Decompressing %s...%%%.2f\r'%(fileName,fin.tell()/finSize*100), end=' ')
	#print fin.tell()

def split(fileName,outDir):
	fileSubName=fileName[0:-4]
	fin=open(os.path.join(outDir,'%s_raw.data'%fileSubName),'rb')
	size=struct.unpack('<I4x',fin.read(8))[0]
	while size!=0:
		yield fin.read(size)[:-2]
		strtmp=fin.read(8)
		if len(strtmp)!=8:return
		size=struct.unpack('<I4x',strtmp)[0]
	return
class xmlParser:
	def __init__(self,strXML,outDir,fileName,num,enableRecursion):
		self.strXML=strXML
		self.outDir=outDir
		self.xmlParsed=''
		self.name=''
		self.fileName=fileName
		self.enableRecursion=enableRecursion
		self.fileSubName=fileName[0:-4]
		self.num=num
		self.stack=[]
		self.subEntries=[]
		self.dataBuffer=''
		self.failNum=0
		self.entryNum=0
		self.subEntryNum=0
		try:
			self.parse()
		except Exception as e:
			self.handler(e)
	def handler(self,e):
		printLog(str(e))
		printLog('Error occured while processing %s_%d'%(self.fileSubName,self.num))
		if os.path.exists(os.path.join(outDir,'%s_%d.txt'%(self.fileSubName,self.num))) and self.enableRecursion:
			printLog('Patch %s_%d.txt found'%(self.fileSubName,self.num))
			P=xmlParser(open(os.path.join(outDir,'%s_%d.txt'%(self.fileSubName,self.num)),'rb').read(),self.outDir,self.fileName,self.num,False)
			self.strXML=P.strXML
		else:
			printLog('Open "fail.log",correct the error(s) and save.Press enter to resume.')
			fTmp=open(os.path.join(self.outDir,'fail.log'),'wb')
			fTmp.write(self.strXML)
			fTmp.close()
			input()
			P=xmlParser(open(os.path.join(self.outDir,'fail.log'),'rb').read(),self.outDir,self.fileName,self.num,False)
			self.strXML=P.strXML
			open(os.path.join(outDir,'%s_%d.txt'%(self.fileSubName,self.num)),'wb').write(self.strXML)
		printLog('Error corrected and patch file saved,continue.')
	def StartElementHandler(self,name,attrs):
		self.stack.append(name)
		if attrs!={}:
			print(attrs)
			print(strXML)
			input()
		if name in ['DX','XB','JC','TS','CC','XY','CZ','PS','YY','YF','YX']:self.entryNum=0
		self.specialProcess(name)
		self.flushBuffer(name)
		self.xmlParsed+='<span class="%s">'%name	#write current tag
		self.writeTitle(name)
	def writeTitle(self,name):
		index={'XB':'词性变化','JC':'继承用法','TS':'特殊用法','CC':'参考词汇','XY':'习惯用语','CZ':'常用词组','PS':'派生','YY':'语源','YF':'用法',}
		if name in list(index.keys()):self.xmlParsed+='<hr><span class="section_title">%s</span>'%index[name]
	def EndElementHandler(self,name):
		self.stack.pop()
		self.specialProcess(name)
		if name=='JX' and re.search(r'\S',self.dataBuffer)!=None:
			self.entryNum+=1
			self.xmlParsed+='<span class="entryNum">%d.</span>'%self.entryNum
			self.xmlParsed+='<span class="entryDot">■</span>'
		if (name=='YX' or (name=='JX' and self.fileName=='1#509.dic')) and (list(set(['JC','TS','XY','CZ','PS'])&set(self.stack))!=[]):
			self.subEntryNum+=1
			self.subEntries.append((self.dataBuffer,self.name,self.subEntryNum))
			self.xmlParsed+='<a name="subEntry%d"/>'%(self.subEntryNum)
		if name in ['CB','PY']:
			if self.dataBuffer!='' and re.search(r'\S',self.dataBuffer)!=None and self.dataBuffer[0]!='[':
				if self.dataBuffer[:4]=='D.J.' or self.dataBuffer[:4]=='K.K.':
					self.dataBuffer='%s[%s]'%(self.dataBuffer[:4],self.dataBuffer[4:])
				else:
					self.dataBuffer='[%s]'%(self.dataBuffer)
		self.flushBuffer(name)
		self.xmlParsed+=r'</span>'
	def CharacterDataHandler(self,data):
		self.dataBuffer+=data
	def flushBuffer(self,name):
		if name=='DC':self.name=self.dataBuffer
		if self.dataBuffer!='':
			self.xmlParsed+=self.dataBuffer
			self.dataBuffer=''
	def parse(self):
		p=xml.parsers.expat.ParserCreate('UTF-16')
		p.StartElementHandler=self.StartElementHandler
		p.EndElementHandler=self.EndElementHandler
		p.CharacterDataHandler=self.CharacterDataHandler
		p.returns_unicode=True
		p.Parse(self.strXML)
	def specialProcess(self,name):
		if re.search(r'\S',self.dataBuffer)==None:self.dataBuffer=''
		regs=(r'&i\{([^{}]*)\}',r'&b\{([^{}]*)\}',r'&[ul]\{\s*([^{}]*)\s*\}',r'&\+\{([^{}]*)\}',r'&\-\{([^{}]*)\}',r'&amp;',r'&x\{([^{}]*)\}',r'\+\[\d+\]',r'&lt;',r'&gt;')
		if name=='DC':
			repls=('\g<1>','\g<1>','\g<1>','g<1>','\g<1>','&','\g<1>','','<','>')
		else:
			repls=('<span class="italic">\g<1></span>','<span class="bold">\g<1></span>','<a href="entry://\g<1>">\g<1></a>;','<span class="superscript">\g<1></span>','<span class="subscript">\g<1></span>','&','\g<1>','','<','>')
		matchList=list(zip(repls,regs))
		if '&' in self.dataBuffer or '+' in self.dataBuffer:
			isReplaced=True
			while isReplaced:
				isReplaced=False
				for repl,reg in matchList:
					self.dataBuffer,repNum=re.subn(reg, repl, self.dataBuffer,flags=re.IGNORECASE|re.DOTALL)
					if repNum>0:isReplaced=True
		return
if __name__=='__main__':
	opt = sys.argv[1:]
	if len(opt)>=1:
		if not os.path.isabs(opt[0]):
			print('\''+opt[0]+'\''+' is not a valid directory name!')
			sys.exit()
		else:outDir=re.match(r'(.*?)\\*$',opt[0]).group(1)
	else:outDir=os.getcwd()
	if len(opt)>=2:
		if not os.path.exists(opt[1]):
			print('\''+opt[1]+'\''+' doesn\'t exsists!')
			sys.exit()
		else:dictDir=re.match(r'(.*?)\\*$',opt[1]).group(1)
	else:dictDir=os.getcwd()
	#get dictionaries
	global flog
	flog=open(os.path.join(outDir,'log.txt'),'wb')
	fileNames=[]
	items=0
	entries=0
	printLog('Target directory:%s'%dictDir)
	printLog('Working(output) directory:%s'%outDir)
	printLog('Scanning directory:',True)
	for root,dirs,files in os.walk(dictDir):
		for fileName in [file.lower() for file in files]:
			if fileName[-3:]=='dic':
				fileSubName=fileName[0:-4]
				dicName,dicCount=getInfo(fileName,dictDir,outDir)
				printLog('%s DicName:%s  ,DicCount:%s'%(fileName,dicName,dicCount))
				print('Add to conversion list(y/n)?')
				if input().lower()=='y':fileNames.append((fileName,dicName))
	print('Conversion List as follows ,continue(y/n)?:')
	printLog('Conversion List:',True)
	for fileName,dicName in fileNames:
		printLog('%-10s  %-10s'%(fileName,dicName),False)
	if input().lower()!='y':os._exit(0)
	fout=open(os.path.join(outDir,'output.txt'),'wb')
	printLog('Start time:%s'%time.ctime())
	timeStart=time.time()
	#decompressing and processing
	for fileName,dicName in fileNames:
		decompress(fileName,dictDir,outDir)
		#~ fout=open('%s.txt'%fileSubName,'wb')
		printLog('\r\n')
		sizeProcessed=0
		sizeTotal=os.path.getsize(os.path.join(outDir,'%s_raw.data'%fileName[0:-4]))
		num=0
		printLog('Splitting&Converting %s...'%(fileName[0:-4]),True)
		for strXML in split(fileName,outDir):
			num+=1
			sizeProcessed+=len(strXML)+8
			print('Splitting&Converting %s...%%%.2f\r'%(fileName[0:-4],sizeProcessed/sizeTotal*100), end=' ')
			assert strXML[-2:]=='\x0a\x00'
			#~ fout.write(unicode(strXML,'utf16').encode('utf8'))
			P=xmlParser(strXML,outDir,fileName,num,True)
			if (P.name!='' and P.name[0] not in [' ','\n']) or re.search(r'\S',P.name)!=None:
				items+=1
				entries+=1
				fout.write(P.name.encode('UTF-8'))
				fout.write('\x0d\x0a<link rel="stylesheet" type="text/css" href="sf_cb.css"/>\x0d\x0a'+P.xmlParsed.encode('UTF-8')+'\x0d\x0a</>\x0d\x0a')
				#write subEntries
				for subEntry,name,subEntryNum in P.subEntries:
					if (subEntry!='' and subEntry[0] not in [' ','\n']) or re.search(r'\S',subEntry)!=None:
						items+=1
						fout.write(subEntry.encode('utf-8'))
						fout.write('\x0d\x0a<link rel="stylesheet" type="text/css" href="sf_cb.css"/>\x0d\x0a')
						fout.write('<span class="reference"><span class="ref_title">见:</span>'.encode('utf-8'))
						fout.write('<a href="entry://%s#subEntry%d">%s: %s</a></span>\x0d\x0a</>\x0d\x0a'%(name.encode('utf-8'),subEntryNum,name.encode('utf-8'),subEntry.encode('utf-8')))
					else:printLog('Jump over subentry %s_%d_%d,void name.'%(fileName,num,subEntryNum))
			else:printLog('Jump over %s_%d,void name.'%(fileName,num))
		printLog('\r\n%s:%s converted,%d entries'%(fileName,dicName,num))
	printLog('All conversion completed.Total:%d items,%d entries'%(items,entries))
	printLog('End time:%s'%time.ctime())
	timeEnd=time.time()
	printLog('Total time elapsed:%s'%time.strftime('%H Hours %M minutes %S Seconds',time.gmtime(timeEnd-timeStart)))
	#~ os.system("shutdown.exe -s")
	#~ printLog('shutting down computer...')

按照py文件里的使用方法运行命令出错，怎么解决呢？

D:\>py C:\Users\i\Downloads\cbDumper\cbDumper.py C:\Users\i\Dow
nloads\cbDumper "C:\Users\i\AppData\Local\Kingsoft\Power Word 2016\2016.3.3.0333
\dicts"
Target directory:C:\Users\i\AppData\Local\Kingsoft\Power Word 2016\2016.3.3.0333
\dicts
Working(output) directory:C:\Users\i\Downloads\cbDumper
1#900.dic DicName:简明英汉词典  ,DicCount:8000
Add to conversion list(y/n)?
y
1#901.dic DicName:简明汉英词典  ,DicCount:8000
Add to conversion list(y/n)?
y
Conversion List as follows ,continue(y/n)?:
1#900.dic   简明英汉词典
1#901.dic   简明汉英词典
y
Start time:Wed Oct 10 13:25:56 2018
Traceback (most recent call last):
  File "C:\Users\i\Downloads\cbDumper\cbDumper.py", line 232, in <module>
    decompress(fileName,dictDir,outDir)
  File "C:\Users\i\Downloads\cbDumper\cbDumper.py", line 69, in decompress
    buffer=zlib.decompress(fin.read(offset))
zlib.error: Error -3 while decompressing data: incorrect header check

redstoneleo

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
cbDumper.py转换为py3后的出错情况

运行2to3转换成py3的代码D:\py I:\Users\i\AppData\Local\Programs\Python\Python36-32\Tools\scripts\2to3.py -w C:\Users\i\Downloads\cbDumper\cbDumper.py转换后修改了第15行的代码后的cbDumper.py的内容如下#This Python file use...
复制链接

扫一扫