```
# -*- coding: utf-8 -*-
import os
import codecs
import math
import operator
import re
from xpinyin import Pinyin
pin = Pinyin()
patent_file = "in.txt"

# First pass: index every record line by its numeric ID.
# A record is a line that starts with "<digits>:" followed by at least one
# more character; the whole original line (including its line ending) is
# stored so it can be written back verbatim later.
corpus = {}
record_pattern = re.compile(r"(\d+):(.+)")
with open(patent_file, 'r', encoding='utf-8') as f:
    for line in f:
        match = record_pattern.match(line)
        if match is None:
            # Not a "<digits>:<text>" record; echo it so it can be inspected.
            print(line)
        else:
            ID = match.group(1)
            # A later line with the same ID replaces the earlier one.
            corpus[ID] = line
# Second pass: compute the sort key for every record.
# The key character is the single character immediately before a literal
# "[得像]" tag; because ".*" is greedy, the last such tag on the line wins.
# The key stored is whatever pin.get_pinyin() returns for that character.
cmpDict = {}
key_pattern = re.compile(r"(\d+):.*(.)\[得像\]")
with open(patent_file, 'r', encoding='utf-8') as f:
    for line in f:
        match = key_pattern.match(line)
        if match is None:
            # Lines without the tag get no sort key and are only reported.
            print("search 2", line)
        else:
            ID = match.group(1)
            cmp = match.group(2)
            # NOTE: an earlier variant keyed on cmp.encode("GBK") instead of
            # the pinyin string.
            cmpDict[ID] = pin.get_pinyin(cmp)
# Order the (ID, pinyin) pairs by the pinyin string using plain string
# comparison; sorted() is stable, so equal keys keep cmpDict's order.
mylist = sorted(cmpDict.items(), key=operator.itemgetter(1))
print(mylist)

# Write the original record lines to out.txt in sorted order.
with open("out.txt", 'w', encoding='utf-8') as f:
    for ID, _ in mylist:
        record = corpus[ID]
        # The final line of the input may have no trailing newline; without
        # this guard it would be glued to the next record when it sorts
        # anywhere but last.
        if not record.endswith("\n"):
            record += "\n"
        f.write(record)
```
汉字按照拼音排序
最新推荐文章于 2021-03-15 17:36:53 发布