记录一下代码:
"""
__author__:shuangrui Guo
__description__:
"""
import sys
import nltk
import json
from tqdm import tqdm
#多进程的包
import multiprocessing
import argparse
import os
import re
# Suffix appended to the input file path to build the output file name.
SUFFIX_NLTK = '__nltk.json'
def clean_text(text):
    """Normalise raw text before tokenisation.

    Steps, in order:
      1. every run of non-ASCII characters becomes a single space;
      2. the punctuation marks ``. , ! : ? ( )`` are padded with a space on
         each side;
      3. hyphens become spaces;
      4. every run of two or more whitespace characters collapses to a
         single space.

    Hyphens are replaced *before* the whitespace collapse so that input such
    as ``"a - b"`` does not leave a run of spaces in the result.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Leading and trailing whitespace is not stripped.
    """
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r"([.,!:?()])", r" \1 ", text)
    text = text.replace("-", " ")
    # Collapse last: every earlier step can introduce adjacent spaces.
    return re.sub(r"\s{2,}", " ", text)
def get_line_count(inFile):
    """Return the number of lines in the text file at ``inFile``.

    The file is opened in text mode and read to the end once; a final line
    without a trailing newline still counts as a line.

    Args:
        inFile: Path of the file to count.

    Returns:
        The line count as an int (0 for an empty file).
    """
    with open(inFile, 'r') as handle:
        return sum(1 for _ in handle)
def get_nps_from_tree(tree, words_original, attachNP=False, skip_single_word=True):
    """Collect the top-level ``NP`` chunks of a parse tree as token spans.

    Only the direct children of ``tree`` are inspected. A running token
    offset is kept so that each reported span indexes into
    ``words_original``.

    Args:
        tree: Iterable of parse-tree children; ``nltk.tree.Tree`` children
            are chunks, anything else counts as a single token.
        words_original: Token list the spans refer to.
        attachNP: When true, each result also carries the chunk's leaves
            under the key ``'np'``.
        skip_single_word: When true (the default), chunks consisting of a
            single leaf are not reported.

    Returns:
        A list of dicts with keys ``'st'`` (start index), ``'ed'`` (end
        index, exclusive) and ``'text'`` (the tokens of the span joined by
        single spaces), plus ``'np'`` when ``attachNP`` is set.
    """
    found = []
    offset = 0
    for node in tree:
        # A plain (non-chunk) child occupies exactly one token position.
        if not isinstance(node, nltk.tree.Tree):
            offset += 1
            continue

        leaves = node.leaves()
        width = len(leaves)
        if node.label() == 'NP' and (width > 1 or not skip_single_word):
            end = offset + width
            entry = {
                'st': offset,
                'ed': end,
                'text': ' '.join(words_original[offset:end]),
            }
            if attachNP:
                entry['np'] = leaves
            found.append(entry)
        # Every chunk, NP or not, advances the offset by its leaf count.
        offset += width
    return found
def validate_nps(nps, words_original):
    """Check that every noun-phrase span still matches its recorded text.

    Spans are visited in ascending ``'st'`` order. For each one the tokens
    ``words_original[st:ed]`` are joined with spaces and compared (after
    stripping surrounding whitespace) with the span's ``'text'``.

    Args:
        nps: Iterable of dicts with keys ``'st'``, ``'ed'`` and ``'text'``.
        words_original: Token list the spans index into.

    Returns:
        ``nps`` itself when every span matches. On the first mismatch the
        rebuilt text and the offending span are printed and the list of
        spans accepted so far (in sorted order) is returned instead.
    """
    accepted = []
    ordered = list(nps)
    ordered.sort(key=lambda item: item['st'])
    for candidate in ordered:
        begin, end = candidate['st'], candidate['ed']
        rebuilt = ' '.join(words_original[begin:end])
        # Original author's note, presumably an example phrase relevant to
        # this check:
        # 'A polynomial time algorithm for the Lambek calculus with brackets of bounded order'
        if rebuilt.strip() == candidate['text'].strip():
            accepted.append(candidate)
            continue
        print(rebuilt)
        print(candidate)
        return accepted
    # NOTE(review): the success path returns the caller's (possibly unsorted)
    # input rather than the sorted ``accepted`` list — confirm this is intended.
    return nps
# Chunk grammar handed to nltk.RegexpParser: NBAR is a run of nouns and
# adjectives ending in a noun, and NP is built from NBAR chunks.
# NOTE(review): rules inside one stage are applied in order, so the second NP
# rule presumably never fires because every NBAR has already been chunked by
# the first — confirm before reordering, since that would change the output.
_NP_GRAMMAR = r"""
NBAR:
{<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns(名次和形容词,并且以名词结尾)
NP:
{<NBAR>}
{<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
"""

# Built on first use and then shared by every call.
_NP_PARSER = None


def _get_np_parser():
    """Return the shared chunk parser, constructing it on first use."""
    global _NP_PARSER
    if _NP_PARSER is None:
        _NP_PARSER = nltk.RegexpParser(_NP_GRAMMAR)
    return _NP_PARSER


def get_nps_nltk_raw(doc):
    """Extract noun-phrase spans from one document string.

    The text is cleaned with ``clean_text``, tokenised with
    ``nltk.word_tokenize``, POS-tagged, chunked with the grammar above, and
    the resulting tree is passed to ``get_nps_from_tree``.

    Args:
        doc: Raw text of a single document.

    Returns:
        The list of span dicts produced by ``get_nps_from_tree`` (called
        with its default options).

    Raises:
        Any exception raised by tagging or parsing propagates to the caller.
        The previous handler dropped into an interactive ``ipdb`` session
        and then failed with a ``NameError`` on the unbound parse tree,
        which blocked batch runs and hid the real error.
    """
    doc = clean_text(doc)
    # Token indices in the returned spans refer to this list.
    words_original = nltk.word_tokenize(doc)
    parse_tree = _get_np_parser().parse(nltk.pos_tag(words_original))
    return get_nps_from_tree(parse_tree, words_original)
def writeToJson(inFile, outFile):
    """Write the noun phrases of every input line as one JSON line.

    Each line of ``inFile`` is treated as one document. Its trailing
    carriage-return / newline characters are removed and the rest is passed
    to ``get_nps_nltk_raw``; an empty line yields an empty list without
    being parsed. Output line *i* therefore corresponds to input line *i*.

    Args:
        inFile: Path of the text file to read.
        outFile: Path of the file to (over)write with JSON lines.
    """
    with open(inFile, 'r') as src, open(outFile, 'w') as dst:
        # The line count is only needed so the progress bar knows its total.
        n_lines = get_line_count(inFile)
        for raw in tqdm(src, total=n_lines):
            text = raw.strip('\r\n')
            phrases = get_nps_nltk_raw(text) if text else []
            dst.write(json.dumps(phrases) + '\n')
if __name__ == '__main__':
    # The input path may be given on the command line; with no argument the
    # script falls back to the path that used to be hard-coded here.
    arg_parser = argparse.ArgumentParser(
        description='Extract noun phrases from a text file, '
                    'writing one JSON list per input line.')
    arg_parser.add_argument('inFile', nargs='?', default="./patent_abstract.txt",
                            help='input text file, one document per line')
    inFile = arg_parser.parse_args().inFile
    # The output is written next to the input, with the NLTK suffix appended.
    outFile = inFile + SUFFIX_NLTK
    writeToJson(inFile, outFile)