nltk简明教程
NLTK是Python环境下的NLP(自然语言处理)工具包,包含了丰富的文本处理和文本挖掘API。
安装
安装NLTK比较简单,Linux环境下只需执行 `sudo pip install -U nltk` 即可完成安装。
语料下载
import nltk
# Download NLTK's bundled English corpora into a specific directory.
# If the directory is not one of NLTK's default data paths, expose it through
# the NLTK_DATA environment variable:
#   vim ~/.profile
#   append at the end of the file: export NLTK_DATA="full/path"
#   source ~/.profile
nltk.download(download_dir='./data/nltk/')
# Choose the corpora to download in the GUI window that pops up.
测试下载好的语料
from nltk.corpus import brown
# Show the first 10 words of the Brown corpus.
print(brown.words()[:10])
# Show the first 10 tagged words of the corpus.
print(brown.tagged_words()[:10])
# Report how many words the corpus contains in total.
print(len(brown.words()))
# List the attributes and methods available on the corpus object.
print(dir(brown))
测试下载好的书籍
from nltk.book import *
# *** Introductory Examples for the NLTK Book ***
# Loading text1, ..., text9 and sent1, ..., sent9
# Type the name of the text or sentence to view it.
# Type: 'texts()' or 'sents()' to list the materials.
# text1: Moby Dick by Herman Melville 1851
# text2: Sense and Sensibility by Jane Austen 1811
# text3: The Book of Genesis
# text4: Inaugural Address Corpus
# text5: Chat Corpus
# text6: Monty Python and the Holy Grail
# text7: Wall Street Journal
# text8: Personals Corpus
# text9: The Man Who Was Thursday by G . K . Chesterton 1908
print(text1.name)  # title of the book
# concordance(), similar() and common_contexts() print their results
# themselves and return None, so they are called directly; wrapping them in
# print() would emit an extra "None" line after each result.
text1.concordance(word="love")  # occurrences of "love" with surrounding context
text1.similar(word="very")  # words used in contexts similar to "very"
text1.common_contexts(words=["pretty", "very"])  # contexts shared by both words
text4.dispersion_plot(words=[