1. Gutenberg Corpus
import nltk

emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
print(emma)
emma.concordance("surprize")  # show every context in which the given word appears; concordance prints directly, so wrapping it in print() would only add a stray None
Compute the average word length, average sentence length, and lexical diversity score for each file in the corpus
from nltk.corpus import gutenberg

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))   # counted in characters
    num_words = len(gutenberg.words(fileid))  # counted in word tokens
    num_sents = len(gutenberg.sents(fileid))  # counted in sentences
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))  # lowercase before deduplicating, since the same word may repeat with different capitalization
    # average word length, average sentence length, lexical diversity score, file name
    print(int(num_chars / num_words), int(num_words / num_sents), int(num_words / num_vocab), fileid)
The sents method returns a list of sentences, each itself a list of words; find and print the longest sentence
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')  # a list of word lists
longest_len = max(len(s) for s in macbeth_sentences)  # length of the longest sentence
print([s for s in macbeth_sentences if len(s) == longest_len])  # what the longest sentence actually says
2. Web and chat text corpora
# web and chat text
from nltk.corpus import webtext

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')  # file name plus the first 65 characters
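The chat side of this pair lives in the nps_chat corpus; a minimal sketch using the chatroom fileid shown in the NLTK book:

from nltk.corpus import nps_chat

chatroom = nps_chat.posts('10-19-20s_706posts.xml')  # posts collected from the 20s chatroom on 10/19/2006
print(chatroom[123])  # one tokenized post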
3. Brown Corpus (the first million-word electronic corpus), used to study differences between genres
from nltk.corpus import brown
print(brown.categories())  # list the text categories
print(brown.words(categories='news'))
print(brown.words(fileids=['cg22']))
print(brown.sents(categories=['news', 'editorial', 'reviews']))
Check the frequency of modal verbs within one category
import nltk
from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = nltk.FreqDist(w.lower() for w in news_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # the modal verbs to look up
for m in modals:
    print(m + ':', fdist[m])  # frequency of each modal verb
Show the differences in modal verb usage across genres
import nltk
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(  # build a conditional frequency distribution
    (genre, word)  # condition = genre, sample = word
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)  # print a cross-tabulation of genres by modals
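The same comparison can be drawn as a line chart instead of a table; a small sketch, assuming matplotlib is installed (plot accepts the same conditions/samples keywords as tabulate):

cfd.plot(conditions=genres, samples=modals)  # one line per genre, modal counts on the y-axis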
4. Reuters Corpus
from nltk.corpus import reuters
print(len(reuters.fileids()))  # number of documents
print(len(reuters.categories()))  # number of document categories
print(reuters.categories('training/9865'))  # the categories of a single document
print(reuters.categories(['training/9865', 'training/9880']))  # the categories covered by several documents
print(reuters.fileids('barley'))  # the documents belonging to a category
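Both fileids() and words() also accept lists of categories or fileids, so the corpus can be sliced from either direction; a short sketch following the NLTK book's Reuters examples:

print(reuters.fileids(['barley', 'corn']))  # documents tagged with either category
print(reuters.words('training/9865')[:14])  # the opening words of one training document
print(reuters.words(categories=['barley', 'corn']))  # the words of every document in both categories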
5. U.S. presidential inaugural address corpus, one file per year
from nltk.corpus import inaugural
import nltk
print(inaugural.fileids())
print([fileid[:4] for fileid in inaugural.fileids()])  # the first four characters of each fileid are the year of the address
cfd = nltk.ConditionalFreqDist(  # how often each target word appears in the addresses, year by year
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()
6. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = './training'  # directory that holds the corpus files
wordlists = PlaintextCorpusReader(corpus_root, ['pku_training.utf8', 'cityu_training.utf8', 'msr_training.utf8'])  # the fileid list can also be built with a regular expression
print(wordlists.fileids())
print(wordlists.raw('pku_training.utf8'))
print(len(wordlists.words('pku_training.utf8')))
print(len(wordlists.sents('pku_training.utf8')))
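As the comment above notes, PlaintextCorpusReader also accepts a regular expression in place of the explicit list; a minimal sketch matching the same training files:

wordlists = PlaintextCorpusReader(corpus_root, r'.*_training\.utf8')  # every file under corpus_root whose name matches the pattern
print(wordlists.fileids())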
7. Conditional frequency distributions: plotting and tabulating them
Creating a conditional frequency distribution
from nltk.corpus import brown
import nltk
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
cfd = nltk.ConditionalFreqDist(genre_word)
print(cfd.items())  # one FreqDist per condition
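A ConditionalFreqDist behaves like a dictionary mapping each condition to a FreqDist, so individual counts can be read out directly; a short sketch:

print(cfd.conditions())  # ['news', 'romance']
print(cfd['news'])  # the FreqDist for the news genre
print(cfd['news']['could'])  # how often 'could' occurs in news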
Plotting the distribution
from nltk.corpus import inaugural
import nltk
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()
Tabulating the distribution
from nltk.corpus import udhr
import nltk
languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
print(udhr.fileids())  # list the available translations
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))  # condition = language, sample = word length
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.tabulate(conditions=['English', 'German_Deutsch'],  # if omitted, every condition is shown
             samples=range(10), cumulative=True)  # limit the samples shown; cumulative=True accumulates counts over word lengths
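The same cumulative view also works as a plot, which is how the NLTK book compares word lengths across these languages (requires matplotlib):

cfd.plot(cumulative=True)  # one cumulative word-length curve per language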
8. Generating random text with bigrams
import nltk
sent = ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven', 'and', 'the', 'earth', '.']
print([i for i in nltk.bigrams(sent)])  # nltk.bigrams returns a generator, so materialize it to inspect

def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word)
        word = cfdist[word].max()  # follow the most frequent successor of the current word
text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)  # bigrams over the whole text
# print([i for i in bigrams])  # careful: consuming the generator here would leave nothing for ConditionalFreqDist
cfd = nltk.ConditionalFreqDist(bigrams)  # condition on the first word of each bigram
generate_model(cfd, 'living')  # repeatedly print the most likely next word, starting from 'living'
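Because max() always picks the same successor, this greedy generator soon falls into a repeating loop. A sketch of a sampling variant that instead draws the next word in proportion to its bigram frequency; generate_text_sampled is a hypothetical helper, not part of NLTK:

import random

def generate_text_sampled(cfdist, word, num=15):
    for i in range(num):
        print(word)
        successors = list(cfdist[word].keys())  # every word observed after the current one
        weights = [cfdist[word][s] for s in successors]  # weighted by bigram frequency
        word = random.choices(successors, weights=weights)[0]

generate_text_sampled(cfd, 'living')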