词性标注数据预处理

最新推荐文章于 2021-11-27 16:10:57 发布

张一爻

最新推荐文章于 2021-11-27 16:10:57 发布

阅读量443

点赞数

分类专栏： python代码整合

本文链接：https://blog.csdn.net/weixin_43069769/article/details/107877487

版权

python代码整合专栏收录该内容

115 篇文章 17 订阅

订阅专栏

import numpy as np
import torch
from IPython.display import display, Image
import os
from os import listdir
import re
from tqdm import tqdm

def list_file_path(dirpath):
    """Return the joined path of every entry found directly in ``dirpath``.

    Each name reported by ``os.listdir`` is prefixed with ``dirpath``; the
    listing is not recursive and keeps whatever order the OS returns.
    """
    entry_names = os.listdir(dirpath)
    return [os.path.join(dirpath, name) for name in entry_names]

def replace_lambda(strings, symbles=(' ', '\ufeff'), Replace_the_symbol="\n"):
    """Replace every occurrence of any symbol in ``symbles`` with one string.

    Args:
        strings: The text to process.
        symbles: Iterable of literal substrings to replace. They are escaped
            before being compiled, so regex metacharacters are matched
            verbatim. Defaults to a space and the U+FEFF byte-order mark.
        Replace_the_symbol: The text substituted for each match. It is
            inserted literally (backslashes are not interpreted).

    Returns:
        ``strings`` with all matches replaced. When ``symbles`` is empty the
        input is returned unchanged.
    """
    symbols = list(symbles)
    if not symbols:
        # An empty alternation would compile to a pattern that matches the
        # empty string at every position, so there is nothing to replace.
        return strings
    pattern = re.compile("|".join(re.escape(symbol) for symbol in symbols))
    # A callable replacement keeps Replace_the_symbol literal; a plain string
    # argument would have its backslash escapes processed by re.sub.
    return pattern.sub(lambda _match: Replace_the_symbol, strings)

def clean_page(filepath):
    """Parse one tagged corpus file into rows of ``/``-separated fields.

    Every line of the file is prefixed with the marker ``sentence/sentence``,
    spaces and U+FEFF characters are turned into newlines by
    ``replace_lambda``, and the text is then split on newlines. Empty pieces
    are dropped and each remaining piece is split on ``/``. A leading
    ``['page', 'page']`` row is prepended to the result.

    Args:
        filepath: Path of the text file to read (decoded as UTF-8).

    Returns:
        numpy.ndarray: a 2-D string array when every row has exactly two
        fields; otherwise a 1-D object array whose elements are the row
        lists, so rows with a different field count are still returned.
    """
    # Plain 'utf-8' (not 'utf-8-sig') keeps a leading BOM in the text so that
    # replace_lambda still converts it into a separator; the context manager
    # guarantees the handle is closed.
    with open(filepath, 'r', encoding='utf-8') as page:
        lines = page.readlines()

    # NOTE(review): the marker is concatenated directly in front of each
    # line, so it only forms its own token when the line starts with a
    # space or BOM — confirm against the corpus format.
    tagged = ''.join(replace_lambda('sentence/sentence' + line) for line in lines)

    rows = [['page', 'page']]
    rows.extend(token.split('/') for token in tagged.split('\n') if token != '')

    if all(len(row) == 2 for row in rows):
        return np.array(rows)
    # Rows of differing lengths cannot form a rectangular array; recent NumPy
    # raises ValueError for ragged input unless dtype=object is requested.
    return np.array(rows, dtype=object)

def load_pos_tags(dirpath):
    """Load every file in ``dirpath`` and collect its parsed rows.

    Each directory entry is parsed with ``clean_page``. Rows that contain
    exactly two fields are gathered into the first result; all other rows
    are kept separately so they can be inspected.

    Returns:
        tuple: ``(pairs, leftovers)`` where ``pairs`` is a numpy array built
        from the two-field rows and ``leftovers`` is a plain list of the
        rows whose length is not two.
    """
    two_field_rows, leftovers = [], []
    for filename in tqdm(listdir(dirpath), desc='加载词性标注文件'):
        page_rows = clean_page(os.path.join(dirpath, filename))
        for row in page_rows:
            # Route the row to whichever bucket matches its field count.
            bucket = two_field_rows if len(row) == 2 else leftovers
            bucket.append(row)
    return np.array(two_field_rows), leftovers

# Directory holding the POS-tagged corpus files.
# NOTE(review): absolute, machine-specific path; presumably the argument
# intended for load_pos_tags — confirm and adjust before running.
dirpath = "/NLP数据集合/词性标注数据集/国家语委人工词性标注语料"