0.背景
- 论文提及关键词提取,需要优化分词效果,为此需要领域词典。
- 但是从搜狗和百度下载的词库文件(.scel / .bdict)是二进制格式,无法直接作为文本处理,需要先转为 .txt。
1.搜狗词库和百度词库
- https://pinyin.sogou.com/dict/
- https://shurufa.baidu.com/dict_list
2.搜狗词库文件.scel转.txt
- 在线工具-亲测有效:http://tools.bugscaner.com/sceltotxt/
3.百度词库文件.bdict转.txt
import struct
import binascii
class Baidu(object):
    """Convert a Baidu input-method dictionary (.bdict) into a plain word list.

    The conversion is done in two steps that must be called in order:

    1. ``be2le()`` writes ``<originfile>.le``, a copy of the input in which
       the two bytes of every 16-bit unit are swapped.
    2. ``le2txt()`` reads that intermediate file, skips the leading header
       bytes and writes every run of consecutive CJK characters
       (U+4E00..U+9FFF) as one line of the output text file.
    """

    # The original implementation skipped the first 0x350 hex digits of the
    # byte-swapped file, i.e. 0x1A8 bytes.
    # NOTE(review): presumably this is the .bdict header; not verified
    # against a format specification.
    _HEADER_BYTES = 0x350 // 2

    _BDICT_SUFFIX = '.bdict'

    def __init__(self, originfile):
        """Remember the input path and derive the two output paths.

        Args:
            originfile: Path of the .bdict file to convert.
        """
        self.originfile = originfile
        self.lefile = originfile + '.le'
        # 'name.bdict' -> 'name.txt'. Paths without the expected suffix get
        # '.txt' appended instead of having arbitrary characters chopped off.
        if originfile.lower().endswith(self._BDICT_SUFFIX):
            stem = originfile[:-len(self._BDICT_SUFFIX)]
        else:
            stem = originfile
        self.txtfile = stem + '.txt'
        # Kept only so existing attribute access keeps working; the
        # conversion no longer uses it.
        self.buf = [b'0' for x in range(0, 2)]
        # Characters of the word currently being assembled by le2txt().
        self.listwords = []

    def be2le(self):
        """Write a byte-swapped copy of the input file to ``self.lefile``.

        Every pair of bytes ``(a, b)`` is written as ``(b, a)``. An input
        with an odd number of bytes is padded with a single zero byte so the
        last byte also forms a complete pair.
        """
        with open(self.originfile, 'rb') as of:
            contents = of.read()
        if len(contents) % 2:
            # Pad with exactly one NUL byte (the previous code duplicated the
            # whole buffer, so the "padding" was the file's first byte).
            contents += b'\x00'
        swapped = bytearray(len(contents))
        swapped[0::2] = contents[1::2]
        swapped[1::2] = contents[0::2]
        with open(self.lefile, 'wb') as lef:
            lef.write(swapped)
        print('写入成功转为小端的字节流')

    def le2txt(self):
        """Extract the Chinese words from ``self.lefile`` into ``self.txtfile``.

        The payload after the header is read as big-endian 16-bit code
        units. Consecutive units in U+4E00..U+9FFF form one word; any other
        unit ends the current word. The output is written as UTF-8, one word
        per line.
        """
        with open(self.lefile, 'rb') as lef:
            payload = lef.read()[self._HEADER_BYTES:]
        # Ignore a dangling odd byte; it cannot form a 16-bit unit.
        usable = len(payload) - (len(payload) % 2)
        with open(self.txtfile, 'w', encoding='utf-8') as txtf:
            # Compare raw code units instead of decoding each pair as UTF-16:
            # decoding raises on lone surrogates, which arbitrary binary
            # data may contain.
            for (code,) in struct.iter_unpack('>H', payload[:usable]):
                if 0x4E00 <= code <= 0x9FFF:
                    self.listwords.append(chr(code))
                elif self.listwords:
                    txtf.write(''.join(self.listwords) + '\n')
                    self.listwords = []
            # Flush a word that runs up to the very end of the data.
            if self.listwords:
                txtf.write(''.join(self.listwords) + '\n')
                self.listwords = []
        print('写入txt成功')
if __name__ == '__main__':
    # Script entry point: replace the placeholder below with the path of
    # the .bdict file to convert.
    bdict_path = '你的.bdict文件'
    converter = Baidu(bdict_path)
    # Step 1 writes the byte-swapped intermediate file; step 2 reads it and
    # writes the extracted words to the text file.
    converter.be2le()
    converter.le2txt()
4.参考文献
- https://blog.csdn.net/qiuwen_521/article/details/122056981