使用 NLTK 提取全文词干:首先读入整个文件的内容,然后切分句子和单词,最后提取每个单词的词干。Python 2.x 代码如下所示:
import nltk
class My_Tokenizer():
    """Stem every word of a text file with NLTK's Lancaster stemmer.

    All of the work happens in the constructor: the input file is read in
    full, split into sentences, each sentence is split into word tokens, and
    the stems of each sentence are printed as one list per sentence.
    """

    def __init__(self):
        """Read the input file, tokenize it and print the stems per sentence.

        Raises:
            NameError: if ``infile_path`` has not been defined before the
                class is instantiated.
            IOError: if the file cannot be opened.
        """
        # Imported locally so the stemmer is in scope regardless of how the
        # top-level ``nltk`` package re-exports it.
        from nltk.stem.lancaster import LancasterStemmer

        # NOTE(review): ``infile_path`` is not defined in this block;
        # presumably it is a module-level variable set elsewhere -- confirm.
        with open(infile_path, 'r') as myfile:
            content = myfile.read()

        # Split into sentences first; the word tokenizer is then applied to
        # each sentence. (Previously ``sentences`` was read before it was
        # ever assigned.)
        sentences = nltk.sent_tokenize(content)
        sentences = [nltk.word_tokenize(sent) for sent in sentences]

        # Create the stemmer once, outside the loop. (Previously ``lancaster``
        # was used without being created.)
        lancaster = LancasterStemmer()
        for sent in sentences:
            # Lancaster
            lwords = [lancaster.stem(t) for t in sent]
            # The parenthesised single-argument form prints the same output
            # under Python 2's print statement and Python 3's print function.
            print(lwords)
# Instantiating the class is what triggers the processing: the work is done
# inside My_Tokenizer.__init__, not in a separate method call.
tokenizer = My_Tokenizer()