百度输入法的词库 bdict 转 txt

Python
# -*- coding: utf-8 -*-
"""Convert a Baidu IME ``.bdict`` dictionary file into a plain-text word list."""
import struct
import binascii


class Baidu(object):
    """Two-step converter: byte-swap the source file, then extract CJK words.

    Attributes:
        originfile: path of the source ``.bdict`` file.
        lefile: path of the byte-swapped intermediate file (``originfile + '.le'``).
        txtfile: output path; the last five characters of ``originfile``
            (the ``bdict`` extension) are replaced with ``txt``.
        buf: two-slot scratch list kept for backward compatibility.
        listwords: characters of the word currently being assembled.
    """

    # The extraction skips the first 0x350 hex digits of the swapped file,
    # which is 0x350 // 2 bytes.
    # NOTE(review): the original comment says the dictionary data starts at
    # 0x350, but the offset was applied to hex digits, not bytes. Behaviour is
    # preserved here; confirm against the real file layout.
    _DATA_OFFSET = 0x350 // 2

    def __init__(self, originfile):
        self.originfile = originfile
        self.lefile = originfile + '.le'
        self.txtfile = originfile[:len(originfile) - 5] + 'txt'
        self.buf = [b'0' for _ in range(2)]
        self.listwords = []

    def be2le(self):
        """Swap every pair of adjacent bytes of ``originfile`` into ``lefile``.

        An odd-length input is padded with a single zero byte so the last
        byte still has a partner to swap with.
        """
        with open(self.originfile, 'rb') as of:
            contents = bytearray(of.read())
        if len(contents) % 2:
            contents.append(0)
        swapped = bytearray(len(contents))
        # Even output positions take the odd input bytes and vice versa.
        swapped[0::2] = contents[1::2]
        swapped[1::2] = contents[0::2]
        with open(self.lefile, 'wb') as lef:
            lef.write(swapped)
        print('写入成功转为小端的字节流')

    def le2txt(self):
        """Extract runs of CJK characters from ``lefile`` into ``txtfile``.

        Each 2-byte group after the header offset is read as a big-endian
        16-bit code unit. Consecutive units in U+4E00..U+9FFF form one word;
        any other unit ends the current word. Words are written one per line,
        UTF-8 encoded. A dangling odd byte at the end is ignored.
        """
        with open(self.lefile, 'rb') as lef:
            payload = lef.read()[self._DATA_OFFSET:]
        usable = len(payload) - len(payload) % 2
        # Work on integer code units instead of decoding: lone surrogates
        # cannot be decoded as UTF-16 and would raise.
        units = struct.unpack('>%dH' % (usable // 2), payload[:usable])
        with open(self.txtfile, 'w', encoding='utf-8') as txtf:
            for unit in units:
                if 0x4E00 <= unit <= 0x9FFF:
                    self.listwords.append(chr(unit))
                else:
                    self._flush_word(txtf)
            # Emit a word that runs up to the very end of the data.
            self._flush_word(txtf)
        print('写入txt成功')

    def _flush_word(self, txtf):
        """Write the pending word (if any) as one line and reset the buffer."""
        if self.listwords:
            txtf.write(''.join(self.listwords) + '\n')
        self.listwords = []


if __name__ == '__main__':
    import sys

    # Use the first command-line argument when given; otherwise fall back to
    # the sample path the script was originally written for.
    default_path = './dict_file_565_20111206100521_1.0.0.bdict'
    path = sys.argv[1] if len(sys.argv) > 1 else default_path
    bd = Baidu(path)
    bd.be2le()
    bd.le2txt()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# -*- coding: utf-8 -*-
import struct
import binascii
 
class Baidu(object):
    """Convert a Baidu IME ``.bdict`` dictionary into a plain-text word list.

    The conversion has two steps: ``be2le`` writes a byte-swapped copy of the
    source file, and ``le2txt`` extracts runs of CJK characters from that copy.

    Attributes:
        originfile: path of the source ``.bdict`` file.
        lefile: path of the byte-swapped intermediate file (``originfile + '.le'``).
        txtfile: output path; the last five characters of ``originfile``
            (the ``bdict`` extension) are replaced with ``txt``.
        buf: two-slot scratch list kept for backward compatibility.
        listwords: characters of the word currently being assembled.
    """

    # The extraction skips the first 0x350 hex digits of the swapped file,
    # which is 0x350 // 2 bytes.
    # NOTE(review): the original comment says the dictionary data starts at
    # 0x350, but the offset was applied to hex digits, not bytes. Behaviour is
    # preserved here; confirm against the real file layout.
    _DATA_OFFSET = 0x350 // 2

    def __init__(self, originfile):
        self.originfile = originfile
        self.lefile = originfile + '.le'
        self.txtfile = originfile[:len(originfile) - 5] + 'txt'
        self.buf = [b'0' for _ in range(2)]
        self.listwords = []

    def be2le(self):
        """Swap every pair of adjacent bytes of ``originfile`` into ``lefile``.

        An odd-length input is padded with a single zero byte so the last
        byte still has a partner to swap with.
        """
        with open(self.originfile, 'rb') as of:
            contents = bytearray(of.read())
        if len(contents) % 2:
            contents.append(0)
        swapped = bytearray(len(contents))
        # Even output positions take the odd input bytes and vice versa.
        swapped[0::2] = contents[1::2]
        swapped[1::2] = contents[0::2]
        with open(self.lefile, 'wb') as lef:
            lef.write(swapped)
        print('写入成功转为小端的字节流')

    def le2txt(self):
        """Extract runs of CJK characters from ``lefile`` into ``txtfile``.

        Each 2-byte group after the header offset is read as a big-endian
        16-bit code unit. Consecutive units in U+4E00..U+9FFF form one word;
        any other unit ends the current word. Words are written one per line,
        UTF-8 encoded. A dangling odd byte at the end is ignored.
        """
        with open(self.lefile, 'rb') as lef:
            payload = lef.read()[self._DATA_OFFSET:]
        usable = len(payload) - len(payload) % 2
        # Work on integer code units instead of decoding: lone surrogates
        # cannot be decoded as UTF-16 and would raise.
        units = struct.unpack('>%dH' % (usable // 2), payload[:usable])
        with open(self.txtfile, 'w', encoding='utf-8') as txtf:
            for unit in units:
                if 0x4E00 <= unit <= 0x9FFF:
                    self.listwords.append(chr(unit))
                else:
                    self._flush_word(txtf)
            # Emit a word that runs up to the very end of the data.
            self._flush_word(txtf)
        print('写入txt成功')

    def _flush_word(self, txtf):
        """Write the pending word (if any) as one line and reset the buffer."""
        if self.listwords:
            txtf.write(''.join(self.listwords) + '\n')
        self.listwords = []
 
if __name__ == '__main__':
    import sys

    # Use the first command-line argument when given; otherwise fall back to
    # the sample path the script was originally written for.
    default_path = './dict_file_565_20111206100521_1.0.0.bdict'
    path = sys.argv[1] if len(sys.argv) > 1 else default_path
    bd = Baidu(path)
    # Step 1 writes the byte-swapped intermediate file; step 2 reads it back
    # and writes the text word list.
    bd.be2le()
    bd.le2txt()
 
 
 



  • zeropython 微信公众号 5868037 QQ号 5868037@qq.com QQ邮箱
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值