本文紧接上一篇文档,具体实现的功能如图所示。
代码如下:
// A highlighted block
import os,jieba,jieba.posseg
from typing import List
# Directory containing the LCMC_<letter> XML corpus files that are read as input.
# The trailing separator is required: paths are built by plain string concatenation.
Input_path = 'E:\\模式识别大作业\\XML\\'
# Directory that receives every generated text file (per-corpus extracts,
# the merged Data_All.TXT and the Words_Dict.TXT word list).
Output_path = 'E:\\模式识别大作业\\txt\\'
def XML2txt2(load_XML, load_txt):
    """Extract the Chinese characters of one XML file into a text file.

    Every input line is reduced to the characters that fall in the range
    U+4E00..U+9FFF; markup, punctuation, digits and Latin letters are
    dropped. Each input line that contains at least one such character
    produces exactly one output line; all other lines produce nothing.

    Args:
        load_XML: Path of the source file, read as UTF-8.
        load_txt: Path of the destination file, written as UTF-8
            (overwritten if it already exists).

    Raises:
        OSError: If the source cannot be opened or the destination cannot
            be written.
    """
    xml_filepath = os.path.abspath(load_XML)
    kept_lines = []
    # Context managers guarantee both handles are closed even when
    # decoding or writing fails part-way through.
    with open(xml_filepath, mode='r', encoding='UTF-8') as f_xml:
        for line in f_xml:
            hanzi = ''.join(ch for ch in line if '\u4e00' <= ch <= '\u9fff')
            if hanzi:
                kept_lines.append(hanzi + '\n')
    with open(load_txt, 'w', encoding='utf-8') as f_txt:
        f_txt.writelines(kept_lines)
def XML2txt():
    """Convert every LCMC_<letter> XML file in Input_path to a text file.

    For each corpus letter both spellings of the extension are considered:
    ``LCMC_<letter>.xml`` is converted to ``LCMC_<letter>.txt`` and
    ``LCMC_<letter>.XML`` to ``LCMC_<letter>.TXT`` in Output_path.
    A spelling that does not exist is skipped, and when both spellings
    resolve to the same file (case-insensitive filesystem) the file is
    converted only once instead of twice.

    Raises:
        FileNotFoundError: If neither spelling exists for some letter.
    """
    for letter in 'ABCDEFGHJKLMNPR':
        stem = 'LCMC_' + letter
        converted = None
        for xml_ext, txt_ext in (('.xml', '.txt'), ('.XML', '.TXT')):
            load_XML = Input_path + stem + xml_ext
            if not os.path.isfile(load_XML):
                continue
            # On a case-insensitive filesystem the second spelling is the
            # very same file; converting it again would only repeat work.
            if converted is not None and os.path.samefile(converted, load_XML):
                continue
            load_txt = Output_path + stem + txt_ext
            XML2txt2(load_XML, load_txt)
            converted = load_XML
        if converted is None:
            raise FileNotFoundError(
                'No XML file found for ' + stem + ' in ' + Input_path
            )
def HeBingWemBem():
    """Concatenate all LCMC_<letter>.txt files into Output_path/Data_All.TXT.

    The per-corpus files are appended in the fixed letter order
    A, B, C, ... R, line by line and unchanged. Input and output are both
    handled as UTF-8, which is the encoding the per-corpus files are
    written in.

    Raises:
        OSError: If an input file is missing or the output cannot be written.
    """
    merged_path = Output_path + 'Data_All.TXT'
    with open(merged_path, 'w', encoding='utf-8') as fo:
        for letter in 'ABCDEFGHJKLMNPR':
            part_path = os.path.join(Output_path, 'LCMC_' + letter + '.txt')
            with open(part_path, 'r', encoding='utf-8') as fi:
                # Copy the lines themselves; writing str(list_of_lines)
                # would dump the list's repr (brackets, quotes and literal
                # "\n" escapes) into the merged corpus.
                fo.writelines(fi)
def ShengChengCiKu():
    """Build a word list from Data_All.TXT and save it as Words_Dict.TXT.

    The merged corpus in Output_path is read as UTF-8 and segmented with
    ``jieba.cut``. Every distinct token longer than one character is
    collected into a set, and the ``str()`` representation of that set is
    written to ``Words_Dict.TXT`` (element order is therefore unspecified).

    Parallel segmentation with 10 worker processes is requested, with a
    fallback to ordinary single-process segmentation where jieba does not
    support parallel mode.

    Raises:
        OSError: If the corpus cannot be read or the word list cannot be
            written.
    """
    with open(Output_path + 'Data_All.TXT', 'r', encoding='utf-8') as f:
        Lib = f.read()

    try:
        jieba.enable_parallel(10)
    except NotImplementedError:
        # jieba raises this on non-POSIX systems (e.g. Windows); plain
        # single-process segmentation yields the same tokens.
        pass
    try:
        words = {token for token in jieba.cut(Lib) if len(token) > 1}
    finally:
        # Always restore the default cut functions / release the pool,
        # even if segmentation fails.
        jieba.disable_parallel()

    # NOTE(review): the output encoding is left at the platform default, as
    # before, because the readers of Words_Dict.TXT are not visible here.
    with open(Output_path + 'Words_Dict.TXT', 'w') as f_Dict:
        f_Dict.write(str(words))
if __name__ == '__main__':
    # The order matters: each step reads the files the previous one wrote.
    XML2txt()          # XML corpus files -> per-corpus text files
    HeBingWemBem()     # per-corpus text files -> Data_All.TXT
    ShengChengCiKu()   # Data_All.TXT -> Words_Dict.TXT
    print()