使用 nltk 中的 sents 和 words 读取语料的句子和单词,为后续处理做准备。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from nltk.corpus import gutenberg
# Gutenberg file id shared by both corpus views below.
corpus_id = "burgess-busterbrown.txt"
# Both previews show positions 1 through 19; index 0 is skipped.
preview = slice(1, 20)

# Sentence-level view; the sample output shows each sentence as a list of tokens.
sents = gutenberg.sents(corpus_id)
print(sents[preview])

# Word-level view of the same file.
words = gutenberg.words(corpus_id)
print(words[preview])
输出:
[['I'], ['BUSTER', 'BEAR', 'GOES', 'FISHING'], ['Buster', 'Bear', 'yawned', 'as', 'he', 'lay', 'on', 'his', 'comfortable', 'bed', 'of', 'leaves', 'and', 'watched', 'the', 'first', 'early', 'morning', 'sunbeams', 'creeping', 'through', 'the', 'Green', 'Forest', 'to', 'chase', 'out', 'the', 'Black', 'Shadows', '.'], ['Once', 'more', 'he', 'yawned', ',', 'and', 'slowly', 'got', 'to', 'his', 'feet', 'and', 'shook', 'himself', '.'], ['Then',