import sys import os import json import re import numpy as np def PraseRawdata(author = None,constrain = None,src='./chinese-poetry/json/simplified', category="poet.tang"): def sentenceParse(para): res, num = re.subn(u'(.*)','',para) res,num = re.subn(u'{.*}','',res) res,num = re.subn(u'《.*》','',res) res,num = re.subn(u'[\]\[]','',res) r = '' for i in res: if i not in set('0123456789-'): r+=i r,num = re.subn(u'。。','。',r) return r def haddlejson(file): rst =[] data = json.loads(open(file).read()) for poetry in data: pdata ="" if(author is not None and poetry.get("author")!= author): return None p = poetry.get("paragraphs") flag = False for s in p: sp = re.split(u"[,!。]", s) for tr in sp: if constrain is not None and len(tr) != constrain and len(tr) != 0: flag = True break if flag: break if flag: continue for
一种经典的自然语言处理数据预处理方式
最新推荐文章于 2024-07-20 19:16:01 发布
该博客介绍了如何处理中文诗歌数据,包括解析JSON文件、清理文本、预处理序列,以及使用Keras的`pad_sequences`对序列进行填充,以确保所有序列长度一致。此外,还涉及字典映射,以及将数据保存为二进制文件。
摘要由CSDN通过智能技术生成