# 简单例子 (Simple example)
# -*- coding: utf-8 -*-
"""
http://www.nltk.org/
首页示例
"""
import nltk
# Tokenize the sentence and tag each token with its part of speech.
sentence = "At eight o'clock on Thursday morning Arthur didn't feel very good."
tokens = nltk.word_tokenize(sentence)
print(tokens)
tagged = nltk.pos_tag(tokens)
print(tagged[0:6])

# Identify named entities in the tagged tokens.
entities = nltk.chunk.ne_chunk(tagged)
print(entities)

# Display a parse tree taken from the Penn Treebank sample.
from nltk.corpus import treebank
t = treebank.parsed_sents('wsj_0001.mrg')[0]
t.draw()

# NLTK Chinese corpus: sinica_treebank.
from nltk.corpus import sinica_treebank
sinica_text = nltk.Text(sinica_treebank.words())
print(sinica_text)
# Print the first eight (word, tag) pairs on a single line, space-separated.
# A single parenthesised print works on both Python 2 and Python 3.
print(' '.join('%s%s' % (key, var)
               for (key, var) in sinica_treebank.tagged_words()[:8]))

# Draw the syntax tree of one Chinese sentence.
sinica_treebank.parsed_sents()[15].draw()
# 演示使用NLTK让计算机学习如何通过名字识别性别 (Demo: teaching the computer to guess gender from a name with NLTK)
# -*- coding: utf-8 -*-
"""
演示使用NLTK让计算机学习如何通过名字识别性别。
"""
import nltk
# Feature extractor used for learning.
def gender_features(word):
    """Return the feature dict for a name: its last letter.

    Args:
        word: The name (a string) to extract features from.

    Returns:
        A dict with the single key ``'last_letter'`` mapped to the final
        character of ``word``, or to an empty string when ``word`` is empty.
    """
    # Slicing rather than indexing avoids an IndexError on an empty name.
    return {'last_letter': word[-1:]}
# Load the labelled name lists used for learning.
from nltk.corpus import names
import random
# NOTE: ``names`` is rebound here from the corpus reader to a list of
# (name, gender) pairs; the reader is not accessible under this name afterwards.
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
# Shuffle so the split below does not depend on the order of the source lists.
random.shuffle(names)
# Start learning: turn every name into a (features, label) pair.
f = [(gender_features(n), g) for (n, g) in names]
# The first 500 pairs are held out as the test set; the rest are used for training.
trainset, testset = f[500:], f[:500]
c = nltk.NaiveBayesClassifier.train(trainset)
# Try the classifier on two names.
# Parenthesised print works on both Python 2 and Python 3.
print(c.classify(gender_features('Neo')))
print(c.classify(gender_features('Trinity')))