功能实现:![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/57918ff23743ef6a3a9d8d6ab81b0624.jpeg)
后面的显示不出来了,凑合着看吧。
代码如下
# A highlighted block
import jieba
import os.path as osp
def split_word(Input_path, base_length=7, sections='ABCDEFGHJKLMNPR',
               tokenizer=None):
    """Split each LCMC text file into paragraphs and collect their keywords.

    For every entry ``s`` of *sections*, the file ``LCMC_<s>.txt`` inside
    *Input_path* is read as UTF-8 and cut into consecutive paragraphs of
    *base_length* lines. Lines left over after the last full paragraph are
    appended to the line *base_length* positions before them, i.e. they are
    folded into the last full paragraph. A file shorter than *base_length*
    lines is treated as one single short paragraph.

    Args:
        Input_path: Directory that contains the ``LCMC_<s>.txt`` files.
        base_length: Number of lines per paragraph (must be >= 1).
        sections: Iterable of file-name suffixes to process, in order.
        tokenizer: Callable that maps a string to an iterable of tokens.
            Defaults to ``jieba.cut``.

    Returns:
        A list with one ``[content_set, key_content]`` pair per section.
        ``content_set[0]`` is an empty set and ``content_set[k]`` is the set
        of lines of paragraph ``k``. ``key_content[k]`` is the set of tokens
        of paragraph ``k`` that are at least two characters long, and
        ``key_content[0]`` is the union of all of them.

    Raises:
        ValueError: If *base_length* is smaller than 1.
        OSError: If one of the input files cannot be opened.
    """
    if base_length < 1:
        raise ValueError('base_length must be a positive integer')
    if tokenizer is None:
        tokenizer = jieba.cut

    content_all = []
    for i in sections:
        print(i)
        load_XML = osp.join(Input_path, 'LCMC_' + i + '.txt')
        with open(load_XML, 'r', encoding='UTF-8') as file_read:
            content = file_read.readlines()  # raw lines of the document

        length = len(content)  # total number of lines in this document
        paragraph_length, mod = divmod(length, base_length)
        print(length, paragraph_length, mod)

        if paragraph_length == 0:
            # Not even one full paragraph: keep whatever lines exist as a
            # single short paragraph instead of indexing out of range.
            paragraphs = [content] if content else []
        else:
            full_end = paragraph_length * base_length
            # Fold each leftover line into the line base_length before it.
            # Positive indices are used because content[-0] is content[0],
            # which is what broke a negative-index version of this loop.
            for idx in range(full_end, length):
                content[idx - base_length] += content[idx]
            paragraphs = [content[start:start + base_length]
                          for start in range(0, full_end, base_length)]

        content_set = [set()]
        key_content = [set()]
        for lines in paragraphs:
            content_set.append(set(lines))
            text = ''.join(lines)
            # Build the filtered set directly; removing items from a list
            # while iterating over it skips elements.
            key_content.append(
                {word for word in tokenizer(text) if len(word) >= 2})

        # Slot 0 holds the vocabulary of the whole document.
        key_content[0] = set().union(*key_content)

        print(content_set)
        print(key_content)
        # The two per-document results, returned together as a list.
        content_all.append([content_set, key_content])
    return content_all
if __name__ == '__main__':
    # Directory that holds the LCMC_<letter>.txt input files.
    Input_path = 'E:\\模式识别大作业\\txt'
    # Run the split over every file and dump the collected result.
    A = split_word(Input_path)
    print(A)