Python 2.7.5 (default, Aug 4 2017, 00:39:18)
[GCC 4.8.5 20150623 (Red Hat 4.8.5-16)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import nltk
>>> raw="""DENNIS:Listern,strange women lying in ponds distributing swords... is no basis for a system of government. Supreme executive power derives from... a mandate from the masses,not from some farcical aquatic ceremony."""
>>> tokens=nltk.word_tokenize(raw)
>>> wnl=nltk.WordNetLemmatizer()
>>> [wnl.lemmatize(t) for t in tokens]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/lib/python2.7/site-packages/nltk/stem/wordnet.py", line 40, in lemmatize
    lemmas = wordnet._morphy(word, pos)
  File "/usr/lib/python2.7/site-packages/nltk/corpus/util.py", line 116, in __getattr__
    self.__load()
  File "/usr/lib/python2.7/site-packages/nltk/corpus/util.py", line 81, in __load
    except LookupError: raise e
LookupError:
**********************************************************************
Resource wordnet not found.
Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('wordnet')
Searched in:
- '/root/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
- '/usr/nltk_data'
- '/usr/lib/nltk_data'
**********************************************************************
>>> nltk.download('wordnet')
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Unzipping corpora/wordnet.zip.
True
>>> [wnl.lemmatize(t) for t in tokens]
['DENNIS', ':', 'Listern', ',', 'strange', u'woman', 'lying', 'in', u'pond', 'distributing', u'sword', '...', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', '...', 'a', 'mandate', 'from', 'the', u'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']
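词形归并(lemmatization)是指把单词因词缀而产生的各种变化形式还原为词典中的原形(词元),例如上面的 women → woman、ponds → pond、swords → sword、masses → mass;只有当还原后的结果是词典(WordNet)中存在的词时才会进行替换,而不是简单地删除词缀。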
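这里没有处理 lying(derives 也同样没有被还原),原因是 lemmatize() 的 pos 参数默认为名词 'n',动词形式不会被还原;要处理动词需要显式指定词性,例如 wnl.lemmatize('lying', pos='v') 会得到 'lie'。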