from __future__ import division
from nltk.book import *
# Find a word in text1 and display it in context (concordance)
text1.concordance("monstrous")
# Find words that appear in similar contexts
text1.similar("monstrous")
# Examine the contexts shared by two or more words
text1.common_contexts(["monstrous","very"])
# Plot a dispersion plot showing where each word occurs in the text
text4.dispersion_plot(["citizens","democracy","freedom"])
# May raise an error on some NLTK 3 versions: generate() was removed in early 3.x releases and later restored
text3.generate()
# Lexical diversity: the average number of times each distinct word is used
print(len(text3)/len(set(text3)))
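# A reusable helper for the same ratio (a sketch, not in the original notes);
# it works for any sequence of tokens such as an NLTK Text object
def lexical_diversity(text):
    return len(text)/len(set(text))
print(lexical_diversity(text3))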
# Count occurrences of a word in the text
print(text3.count("smote"))
# Index of a word / token at a given position
print(text3.index("smote"))
print(text3[9073])
# Text slicing
print(text3[123:178])
# Strings
name="python"
a=" ".join(['hello','python'])
print(a)
# split returns a new list; the original string is unchanged
b=a.split(" ")
print(b)
# Frequency distributions
fdist1=FreqDist(text1)
print(fdist1)
# In NLTK 3, keys() is not sorted by frequency; most_common() gives the top words
vocabulary1=fdist1.most_common(50)
print(vocabulary1)
fdist1.plot(50,cumulative=True)
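# Further FreqDist queries (a sketch beyond the original notes): most_common gives
# (word, count) pairs sorted by frequency, and hapaxes lists words that occur only once
print(fdist1.most_common(10))
print(fdist1.hapaxes()[:10])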
# Fine-grained selection of words
V=set(text1)
fdist1=FreqDist(text1)
long_words=[w for w in V if len(w)>15 and fdist1[w]>2]
print(long_words)
# Collocations and bigrams
print(list(bigrams(['more','is','than','done'])))
# collocations() prints its results itself and returns None, so no print() is needed
text1.collocations()
import nltk
# Human-computer dialogue: NLTK's demo chatbots
nltk.chat.chatbots()
import nltk
# Capture user input
# raw_input() no longer exists in Python 3; use input() instead
s=input("enter some text")
print("you typed",len(nltk.word_tokenize(s)),"words")
# Regular expressions
import re
import nltk
wordlist=[w for w in nltk.corpus.words.words('en') if w.islower()]
# print(wordlist)
# Find words ending in "ed"
print([w for w in wordlist if re.search('ed$',w)])
# ^ anchors the start of the string, $ anchors the end
print([w for w in wordlist if re.search('^..j..t..$',w)])
# Ranges and closures: the character classes follow the letters on a phone keypad (textonyms)
print([w for w in wordlist if re.search('^[ghi][mno][jlk][def]$',w)])
# Extracting character blocks with a regular expression
word="asdnsjndnsudndjkdn"
print(re.findall(r'[aeiou]',word))
print(len(re.findall(r'[aeiou]',word)))
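# A related extraction (sketch): runs of two or more adjacent vowels
print(re.findall(r'[aeiou]{2,}','supercalifragilisticexpialidocious'))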
# Wrapping text
from textwrap import fill
saying=['After','all','is','said','and','done']
format='%s(%d)'
pieces=[format % (word,len(word)) for word in saying]
output=' '.join(pieces)
wrapped=fill(output)
print(wrapped)
# Visualizing the hyponym relationships between words with NetworkX
import networkx as nx
import matplotlib.pyplot as plt
from nltk.corpus import wordnet as wn
def traverse(graph,start,node):
    # record each synset's distance from the start synset; name() is a method in NLTK 3
    graph.depth[node.name()]=node.shortest_path_distance(start)
    for child in node.hyponyms():
        graph.add_edge(node.name(),child.name())
        traverse(graph,start,child)
def hyponym_graph(start):
    G=nx.Graph()
    G.depth={}
    traverse(G,start,start)
    return G
def graph_draw(graph):
    nx.draw(graph,
            node_size=[16*graph.degree(n) for n in graph],
            node_color=[graph.depth[n] for n in graph],
            with_labels=False)
    plt.show()
dog=wn.synset('dog.n.01')
graph=hyponym_graph(dog)
graph_draw(graph)
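# Quick sanity check of the structure just drawn (sketch, beyond the original notes):
# a few direct hyponyms of dog.n.01 and the deepest distance recorded in the graph
print(dog.hyponyms()[:5])
print(max(graph.depth.values()))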
# Unigram tagging
# Training a unigram tagger on Brown news sentences
import nltk
from nltk.corpus import brown
brown_tagged_sents=brown.tagged_sents(categories='news')
brown_sents=brown.sents(categories='news')
unigram_tagger=nltk.UnigramTagger(brown_tagged_sents)
print(unigram_tagger.tag(brown_sents[2007]))
print(unigram_tagger.evaluate(brown_tagged_sents))
# Separating training and test data
size=int(len(brown_tagged_sents)*0.9)
train_sents=brown_tagged_sents[:size]
test_sents=brown_tagged_sents[size:]
unigram_tagger=nltk.UnigramTagger(train_sents)
print(unigram_tagger.evaluate(test_sents))
# Combining taggers with backoff
t0=nltk.DefaultTagger('NN')
t1=nltk.UnigramTagger(train_sents,backoff=t0)
t2=nltk.BigramTagger(train_sents,backoff=t1)
print(t2.evaluate(test_sents))
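# Usage sketch: tag a single held-out Brown sentence with the combined tagger
# (sentence index 3000 is an arbitrary choice)
print(t2.tag(brown_sents[3000]))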
# Storing the tagger: in Python 3 use pickle (there is no cPickle module)
from pickle import dump
output=open('t2.pkl','wb')
dump(t2,output,-1)
output.close()
from pickle import load
# avoid shadowing the built-in input()
infile=open('t2.pkl','rb')
tagger=load(infile)
infile.close()
text="I'm a small girl in a big world"
tokens=text.split()
print(tagger.tag(tokens))
# Gender identification
import nltk
nltk.download('names')
# The feature extractor returns a dictionary, which serves as the feature set
def gender_features(word):
    return {'last_letter':word[-1]}
print(gender_features('Shrek'))
# Prepare a list of examples paired with their class labels
from nltk.corpus import names
import random
names=([(name,'male') for name in names.words('male.txt')]+
[(name,'female') for name in names.words('female.txt')])
random.shuffle(names)
# Run the feature extractor over the name data, split into training and test sets, and train a naive Bayes classifier
featuresets=[(gender_features(n),g) for (n,g) in names]
train_set,test_set=featuresets[500:],featuresets[:500]
classifier=nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify(gender_features('lucy')))
# Show the most informative features
classifier.show_most_informative_features(5)
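# Accuracy on the 500 held-out names (usage sketch, as in the NLTK book)
print(nltk.classify.accuracy(classifier,test_set))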
# Document classification
# using the movie reviews corpus
import random
import nltk
from nltk.corpus import movie_reviews
documents=[(list(movie_reviews.words(fileid)),category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
# Build a list of the most frequent words in the corpus
all_words=nltk.FreqDist(w.lower() for w in movie_reviews.words())
# keys() is not frequency-sorted in NLTK 3; take the 2000 most common words instead
word_features=[w for (w,_) in all_words.most_common(2000)]
# Define the feature extractor
def document_features(document):
    document_words=set(document)
    features={}
    for word in word_features:
        features['contains(%s)' % word]=(word in document_words)
    return features
# print(document_features(movie_reviews.words('pos/cv957_8737.txt')))
# Train and test a classifier for document classification
featuresets=[(document_features(d),c) for (d,c) in documents]
train_set,test_set=featuresets[100:],featuresets[:100]
classifier=nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier,test_set))
classifier.show_most_informative_features(5)
# Part-of-speech tagging based on sentence context
def pos_features(sentence,i):
    features={"suffix(1)":sentence[i][-1:],
              "suffix(2)":sentence[i][-2:],
              "suffix(3)":sentence[i][-3:]}
    if i==0:
        features["pre-word"]="<START>"
    else:
        features["pre-word"]=sentence[i-1]
    return features
# pass a list of tokens, not a raw string, so that sentence[i] is a word
print(pos_features("I'm a small girl in a big world".split(),4))
# Part-of-speech tagging with a consecutive (sequence) classifier
import nltk
def pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["pre-word"] = "<START>"
        features["pre-tag"] = "<START>"
    else:
        features["pre-word"] = sentence[i - 1]
        features["pre-tag"] = history[i - 1]
    return features
class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return list(zip(sentence, history))
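# Usage sketch (following the NLTK book): train the consecutive tagger on 90% of
# the Brown news sentences and evaluate on the remaining 10%; training takes a while
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))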
# Noun phrase chunking with a consecutive classifier
import nltk
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        # 'megam' needs the external megam binary; if it is not installed,
        # drop the algorithm argument to fall back to NLTK's built-in trainer
        self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam', trace=0)
    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return list(zip(sentence, history))
class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)
    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)
def npchunk_features(sentence,i,history):
    word,pos=sentence[i]
    return {"pos":pos}
# the chunker is trained and evaluated on CoNLL-2000 chunked sentences, as in the NLTK book
from nltk.corpus import conll2000
train_sents=conll2000.chunked_sents('train.txt',chunk_types=['NP'])
test_sents=conll2000.chunked_sents('test.txt',chunk_types=['NP'])
chunker=ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
# Named entity recognition
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')  # ne_chunk also needs the words corpus
sent=nltk.corpus.treebank.tagged_sents()[22]
print(nltk.ne_chunk(sent,binary=True))
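# Without binary=True, ne_chunk labels entity types such as PERSON, ORGANIZATION and GPE
# (usage sketch)
print(nltk.ne_chunk(sent))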
# Relation extraction
import nltk
import re
IN=re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG','LOC',doc,corpus='ieer',pattern=IN):
        # in NLTK 3 the relation tuple is rendered with nltk.sem.rtuple
        print(nltk.sem.rtuple(rel))