源文件的存储格式为:一级目录(dirname)/二级目录(类别目录,对应代码中的 category)/具体文件(对应代码中的 cur_file)。
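重新写入后的数据分别存储在 data/cnews 目录下的 cnews.train.txt、cnews.test.txt、cnews.val.txt 三个文件中,文件中每一行的具体格式为:类别\t内容(即 category 与 content 之间以制表符分隔)。每个类别按遍历顺序,前 5000 个文件写入训练集,接下来的 1000 个写入测试集,其余写入验证集。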
代码:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
将文本整合到 train、test、val 三个文件中
"""
import os
def _read_file(filename):
    """Read a UTF-8 text file and return its content collapsed onto one line.

    Newlines, tabs and ideographic (U+3000) spaces are removed so the text
    can be stored as a single tab-separated record.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file content as a ``str`` with the characters above stripped.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        text = handle.read()
    # Strip every character that would break the one-record-per-line format.
    for unwanted in ('\n', '\t', '\u3000'):
        text = text.replace(unwanted, '')
    return text
def save_file(dirname, out_dir='data/cnews', train_size=5000, test_size=1000):
    """Merge the per-category text files into train/test/val split files.

    The expected input layout is ``dirname/<category>/<text file>``. Each
    text file becomes one output line of the form ``category<TAB>content``.
    Within every category the first ``train_size`` files go to
    ``cnews.train.txt``, the next ``test_size`` files to ``cnews.test.txt``
    and all remaining files to ``cnews.val.txt``.

    Args:
        dirname: Root directory of the original data set.
        out_dir: Directory that receives the three output files; it is
            created if it does not exist.
        train_size: Number of files per category written to the train split.
        test_size: Number of files per category written to the test split.
    """
    # Opening the outputs fails if the target directory is missing.
    os.makedirs(out_dir, exist_ok=True)
    train_path = os.path.join(out_dir, 'cnews.train.txt')
    test_path = os.path.join(out_dir, 'cnews.test.txt')
    val_path = os.path.join(out_dir, 'cnews.val.txt')

    # Context managers guarantee the outputs are flushed and closed even if
    # reading one of the source files raises.
    with open(train_path, 'w', encoding='utf-8') as f_train, \
            open(test_path, 'w', encoding='utf-8') as f_test, \
            open(val_path, 'w', encoding='utf-8') as f_val:
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # resulting split reproducible across runs and machines.
        for category in sorted(os.listdir(dirname)):
            cat_dir = os.path.join(dirname, category)  # one category folder
            if not os.path.isdir(cat_dir):
                continue
            for count, cur_file in enumerate(sorted(os.listdir(cat_dir))):
                # Full path of a single text file inside the category.
                content = _read_file(os.path.join(cat_dir, cur_file))
                if count < train_size:
                    target = f_train
                elif count < train_size + test_size:
                    target = f_test
                else:
                    target = f_val
                target.write(category + '\t' + content + '\n')
            print('Finished:', category)
if __name__ == '__main__':
    save_file('data/THUCNews')
    # Print the number of records in each split. Each file is closed after
    # counting, and lines are streamed instead of loaded into memory at once.
    for split_path in ('data/cnews/cnews.train.txt',
                       'data/cnews/cnews.test.txt',
                       'data/cnews/cnews.val.txt'):
        with open(split_path, 'r', encoding='utf-8') as split_file:
            print(sum(1 for _ in split_file))
参考:https://github.com/gaussic/text-classification-cnn-rnn