import sys import os import json import re import numpy as np def PraseRawdata(author = None,constrain = None,src='./chinese-poetry/json/simplified', category="poet.tang"): def sentenceParse(para): res, num = re.subn(u'(.*)','',para) res,num = re.subn(u'{.*}','',res) res,num = re.subn(u'《.*》','',res) res,num = re.subn(u'[\]\[]','',res) r = '' for i in res: if i not in set('0123456789-'): r+=i r,num = re.subn(u'。。','。',r) return r def haddlejson(file): rst =[] data = json.loads(open(file).read()) for poetry in data: pdata ="" if(author is not None and poetry.get("author")!= author): return None p = poetry.get("paragraphs") flag = False for s in p: sp = re.split(u"[,!。]", s) for tr in sp: if constrain is not None and len(tr) != constrain and len(tr) != 0: flag = True break if flag: break if flag: continue for
一种经典的自然语言处理数据预处理方式
最新推荐文章于 2024-04-12 00:26:48 发布