1、词形还原
名词:复数->单数
动词:分词->原形
import nltk.stem as ns
# Sample vocabulary mixing plural nouns, verb participles and words that
# should come back unchanged.
words = ['table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the',
         'beaches', 'grounded', 'dreamt', 'envision']
lemmatizer = ns.WordNetLemmatizer()

# First pass: lemmatize every word as a noun (part-of-speech tag 'n').
for word in words:
    lemma = lemmatizer.lemmatize(word, 'n')
    print(lemma)

# Separator line between the noun results and the verb results.
print('-' * 72)

# Second pass: lemmatize the same words as verbs (part-of-speech tag 'v').
for word in words:
    lemma = lemmatizer.lemmatize(word, 'v')
    print(lemma)
2、词块划分
import nltk.corpus as nc
# Number of words placed in each chunk (one printed row per chunk).
CHUNK_SIZE = 5

# Build a sample document from the first 310 tokens of the Brown corpus.
doc = ' '.join(nc.brown.words()[:310])
print(doc)

# Split the document back into whitespace-separated words.
words = doc.split()
print(words)

# Group the words into consecutive chunks of CHUNK_SIZE; the last chunk
# holds whatever remains and may be shorter.
chunks = [words[start:start + CHUNK_SIZE]
          for start in range(0, len(words), CHUNK_SIZE)]

# Print one chunk per line, each word padded to a 15-character column.
for chunk in chunks:
    for word in chunk:
        print('{:15}'.format(word), end='')
    print()