# -*- coding: utf-8 -*-
import os
import json
import re
# Name of the folder holding the source JSON documents, and the name of the
# folder that receives the re-chunked copies.
dir_source_name = 'FullText2'
dir_source_name_new = '{}_new'.format(dir_source_name)

# Both folders are resolved relative to the directory containing this script.
root_path = os.path.abspath(os.path.dirname(__file__))
json_path = os.path.join(root_path, dir_source_name)
json_path_new = os.path.join(root_path, dir_source_name_new)

# Sliding-window settings used by read_files: each chunk holds at most
# max_len characters, and consecutive chunks share window_size characters.
max_len = 510
window_size = 64
def write_file_new(content, path):
    """Serialize *content* as JSON into the file at *path*.

    The file is opened for writing as UTF-8 (replacing any existing file) and
    non-ASCII characters are written verbatim instead of being escaped.
    """
    with open(path, mode='w', encoding='utf-8') as out_handle:
        json.dump(content, out_handle, ensure_ascii=False)
def _split_text(text, chunk_len, overlap):
    """Return *text* cut into chunks of at most *chunk_len* characters.

    Consecutive chunks share *overlap* characters. Splitting stops as soon as
    a chunk reaches the end of the text, so the last chunk always contains
    characters that no earlier chunk covered.

    Raises:
        ValueError: if *overlap* is not smaller than *chunk_len*, since the
            window could then never advance.
    """
    stride = chunk_len - overlap
    if stride <= 0:
        raise ValueError('overlap must be smaller than chunk_len')
    chunks = []
    start = 0
    while True:
        end = start + chunk_len
        chunks.append(text[start:end])
        # ">=" (not ">"): a window ending exactly at the end of the text is
        # already the last one; continuing would emit only the overlap again.
        if end >= len(text):
            return chunks
        start += stride


def read_files(path, path_new):
    """Re-chunk the ``text`` field of every JSON file in *path* into *path_new*.

    For each regular file directly inside *path*, the JSON document is loaded,
    newlines in its ``text`` value are replaced by '。' and all remaining
    whitespace is removed. The cleaned text is then split into overlapping
    windows (``max_len`` characters long, sharing ``window_size`` characters)
    and one copy of the document is written per window as
    ``<stem>_<index>.json``, with ``text`` replaced by that window.

    Args:
        path: directory containing the source JSON files.
        path_new: directory receiving the output files; created if missing.

    Raises:
        KeyError: if a document has no ``text`` key.
        ValueError: if a file is not valid JSON, or ``window_size`` is not
            smaller than ``max_len``.
    """
    # Create the directory we actually write into, not a same-named directory
    # relative to the current working directory.
    os.makedirs(path_new, exist_ok=True)

    for file_name in os.listdir(path):
        # listdir yields bare names; isfile needs the full path.
        filepath = os.path.join(path, file_name)
        if not os.path.isfile(filepath):
            continue

        with open(filepath, 'r', encoding='UTF-8') as js_file:
            js = json.load(js_file)

        text = re.sub(r'\s', '', js['text'].replace('\n', '。'))
        # splitext drops only the final extension, so names such as
        # "a.1.json" and "a.2.json" no longer collapse onto the same stem.
        filepath_new = os.path.join(path_new, os.path.splitext(file_name)[0])

        if not text:
            # NOTE(review): empty documents are written without a ".json"
            # suffix, unlike the chunked outputs below — kept as-is; confirm
            # whether downstream readers rely on this name.
            js['text'] = text
            write_file_new(js, filepath_new)
            continue

        for index, chunk in enumerate(_split_text(text, max_len, window_size)):
            js['text'] = chunk
            write_file_new(js, filepath_new + '_' + str(index) + '.json')
# Script entry: re-chunk every document from the source folder into the output folder.
read_files(path=json_path, path_new=json_path_new)
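# os.mkdir() creates only the last directory in the path; it raises an error if a parent directory does not exist.
# os.makedirs() creates nested directories, automatically creating any missing intermediate directories.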