一 得到原始文本内容
def FileRead(self, filePath):
    """Return the entire contents of the file at ``filePath`` as one string.

    The file is opened in text mode with the platform default encoding,
    exactly as a bare ``open(filePath)`` would.

    Args:
        filePath: Path of the file to read.

    Returns:
        The full text of the file.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    # The context manager guarantees the handle is closed even if read()
    # raises; the previous version never closed it.
    with open(filePath) as f:
        return f.read()
二 分割成句子
def SenToken(self, raw):
    """Split raw text into sentences.

    Loads the tokenizer stored at ``tokenizers/punkt/english.pickle`` via
    ``nltk.data.load`` on every call and returns the result of its
    ``tokenize`` method applied to ``raw``.

    Args:
        raw: The text to segment.

    Returns:
        Whatever the loaded tokenizer's ``tokenize(raw)`` returns
        (presumably a list of sentence strings -- confirm against NLTK docs).
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return punkt.tokenize(raw)
三 句子内容的清理,去掉数字标点和非字母字符
def CleanLines(self, line):
    """Remove ASCII punctuation and ASCII digits from ``line``.

    Only the characters in ``string.punctuation`` and ``string.digits`` are
    dropped; whitespace, letters and any non-ASCII characters are kept as-is.

    Args:
        line: The text to clean.

    Returns:
        A copy of ``line`` without ASCII punctuation or digit characters.
    """
    # string.maketrans() and the two-argument str.translate(table, delete)
    # form exist only on Python 2. Filtering by set membership yields the
    # same result there and also works on Python 3.
    unwanted = frozenset(string.punctuation + string.digits)
    return ''.join(ch for ch in line if ch not in unwanted)
四 nltk.pos_tag进行词性标注
def POSTagger(self, sent):
    """Part-of-speech tag every element of ``sent`` with ``nltk.pos_tag``.

    Args:
        sent: Iterable whose elements are each passed to ``nltk.pos_tag``.
            NOTE(review): the name is singular but the body has always
            iterated, so this is presumably a list of tokenized sentences
            -- confirm against callers.

    Returns:
        A list holding one ``nltk.pos_tag`` result per element of ``sent``,
        in the same order. An empty input yields an empty list.
    """
    # The original iterated over an undefined name ``sents`` (NameError);
    # iterate the actual parameter instead.
    return [nltk.pos_tag(tokens) for tokens in sent]
五 nltk.word_tokenize分词
def WordTokener(self, sent):
    """Split a single sentence string into word tokens.

    Args:
        sent: The sentence text to tokenize.

    Returns:
        The result of ``nltk.word_tokenize(sent)``.
    """
    # The unused ``result = ''`` local from the earlier version was dropped.
    return nltk.word_tokenize(sent)
六 enchant拼写检查