import numpy as np
import torch
from IPython.display import display, Image
import os
from os import listdir
import re
from tqdm import tqdm
def list_file_path(dirpath):
    """Return the joined path of every entry directly inside ``dirpath``.

    Args:
        dirpath: Directory to list.

    Returns:
        A list with ``os.path.join(dirpath, name)`` for each name reported by
        ``os.listdir``. Entries are not filtered and keep ``listdir``'s order.
    """
    # ``name`` instead of ``dir`` so the builtin ``dir`` is not shadowed.
    return [os.path.join(dirpath, name) for name in listdir(dirpath)]
def replace_lambda(strings, symbles=(' ', '\ufeff'), Replace_the_symbol="\n"):
    """Replace every occurrence of the given symbols in ``strings``.

    Args:
        strings: Text to process.
        symbles: Iterable of literal substrings to replace. They are escaped,
            so regex metacharacters are matched literally. Defaults to a
            space and the byte-order mark ``'\\ufeff'``.
        Replace_the_symbol: Text inserted in place of each match. It is
            inserted verbatim (backslashes are not interpreted).

    Returns:
        ``strings`` with each match replaced; returned unchanged when
        ``symbles`` is empty.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order, which
    # is also the order of the alternatives in the compiled pattern.
    literals = list(dict.fromkeys(symbles))
    if not literals:
        # An empty alternation would compile to a pattern that matches the
        # empty string at every position, so there is nothing to replace.
        return strings
    pattern = re.compile("|".join(re.escape(literal) for literal in literals))
    # A callable replacement keeps the replacement text literal.
    return pattern.sub(lambda _match: Replace_the_symbol, strings)
def clean_page(filepath, encoding="utf-8"):
    """Read one tagged file and split it into ``token/tag`` rows.

    Every line of the file is prefixed with the marker ``sentence/sentence``.
    Spaces and byte-order marks are then turned into newlines by
    ``replace_lambda``, the text is split on newlines, empty pieces are
    dropped, and each remaining piece is split on ``'/'``.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to open the file. ``"utf-8"`` (not
            ``"utf-8-sig"``) so that a leading BOM stays in the text and is
            still converted into a separator by ``replace_lambda``.

    Returns:
        A NumPy array whose first row is ``['page', 'page']`` followed by the
        split pieces. When every piece has exactly two parts this is a 2-D
        string array; otherwise it is a 1-D object array holding one list per
        piece.
    """
    marker = 'sentence/sentence'
    # NOTE(review): the marker is concatenated directly onto the line, so a
    # line that does not start with a space or BOM yields a fused first piece
    # such as "sentence/sentenceWORD/tag" -- confirm against the corpus format.
    with open(filepath, 'r', encoding=encoding) as handle:
        marked_text = ''.join(marker + line for line in handle)

    # One pass over the whole text instead of one regex compile per line; the
    # replaced symbols are single characters, so the result is the same.
    tagged_text = replace_lambda(marked_text)

    rows = [['page', 'page']]
    rows.extend(piece.split('/') for piece in tagged_text.split('\n') if piece)

    if all(len(row) == 2 for row in rows):
        return np.array(rows)

    # Rows of differing length: current NumPy refuses to build an array from
    # ragged nested lists, so fill an object array element by element.
    ragged = np.empty(len(rows), dtype=object)
    for index, row in enumerate(rows):
        ragged[index] = row
    return ragged
def load_pos_tags(dirpath):
    """Load every file in ``dirpath`` with ``clean_page`` and pool the rows.

    Args:
        dirpath: Directory whose entries are each passed to ``clean_page``.

    Returns:
        A ``(pairs, leftovers)`` tuple. ``pairs`` is ``np.array`` of all rows
        whose length is exactly 2; ``leftovers`` is a plain list of every
        other row.
    """
    pairs = []
    leftovers = []
    # Sorted so the row order is reproducible; os.listdir's order is arbitrary.
    for name in tqdm(sorted(listdir(dirpath)), desc='加载词性标注文件'):
        for row in clean_page(os.path.join(dirpath, name)):
            if len(row) == 2:
                pairs.append(row)
            else:
                leftovers.append(row)
    return np.array(pairs), leftovers
# Corpus location; the path components read "NLP datasets / POS-tagging
# datasets / State Language Commission manually POS-tagged corpus".
# NOTE(review): presumably the argument for load_pos_tags -- the call is not
# visible in this part of the file.
dirpath = "/NLP数据集合/词性标注数据集/国家语委人工词性标注语料"
# 词性标注数据预处理 (POS-tagging data preprocessing)
# 最新推荐文章于 2021-11-27 16:10:57 发布 (latest recommended article published 2021-11-27 16:10:57)