目标:从txt文件中取出除了注释以外的一级中文,按拼音排序并输出
实现:1. 需要先安装 pypinyin 模块:pip install pypinyin
代码如下
#coding:utf-8
import re
import os
from pypinyin import *
def str_duplication(str):
    """Return the distinct items of ``str`` joined into a single string.

    Each item is kept once, in the order of its first occurrence.

    Args:
        str: A string, or any iterable of strings (for example a list of
            single characters). The parameter name shadows the builtin but
            is kept so existing callers are unaffected.

    Returns:
        A string made of every distinct item exactly once.
    """
    # dicts preserve insertion order, so this de-duplicates in one pass
    # instead of re-scanning the growing result for every character.
    return ''.join(dict.fromkeys(str))
def output_sort_str(str):
    """Return the distinct Chinese characters of ``str`` sorted by pinyin.

    ``//`` line comments and ``/* ... */`` block comments are removed first,
    so characters that appear only inside comments are ignored.

    Args:
        str: The text to scan. The parameter name shadows the builtin but
            is kept so existing callers are unaffected.

    Returns:
        A string with each character in the range U+4E00..U+9FA5 that
        remains after comment removal, once each, ordered by the key
        produced by ``lazy_pinyin``. Empty when no such character remains.
    """
    # Raw string so the backslashes reach the regex engine untouched.
    # The block-comment branch uses a lazy scan up to the first "*/";
    # the previous nested-alternation form could backtrack exponentially
    # on an unterminated "/*".
    rule = r'//.*|/\*[\s\S]*?\*/'
    without_comments = re.sub(rule, '', str)

    # Collect every Chinese character that is left.
    chars = re.findall(r'[\u4e00-\u9fa5]', without_comments)

    # Drop duplicates while keeping first-seen order (dicts are ordered).
    unique_chars = dict.fromkeys(chars)

    # Sort by pinyin. The 2 is lazy_pinyin's second positional argument.
    # NOTE(review): presumably this selects pypinyin's TONE2 style, which
    # puts the tone digit inside the syllable; confirm that is the intended
    # collation rather than a tone-at-end style.
    new_list = sorted(unique_chars, key=lambda char: lazy_pinyin(char, 2))

    return ''.join(new_list)
txt_name = 'oled.txt'  # your filename

# Read the whole file as one string.
with open(txt_name, "r", encoding="utf-8") as fp:
    str_data = fp.read()

chinese_data = output_sort_str(str_data)
# Print the sorted characters, not the raw file contents.
print(chinese_data)
输出结果
啊阿打分胡见卡看厘洛绿杉
Process finished with exit code 0