第二章 获得文本语料和词汇资源
1 获取文本语料
古腾堡语料库 gutenberg
>>> import nltk
>>> nltk.corpus.gutenberg.fileids()
['austen-emma.txt',
'austen-persuasion.txt',
'austen-sense.txt',
'bible-kjv.txt',
'blake-poems.txt',
'bryant-stories.txt',
'burgess-busterbrown.txt',
'carroll-alice.txt',
'chesterton-ball.txt',
'chesterton-brown.txt',
'chesterton-thursday.txt',
'edgeworth-parents.txt',
'melville-moby_dick.txt',
'milton-paradise.txt',
'shakespeare-caesar.txt',
'shakespeare-hamlet.txt',
'shakespeare-macbeth.txt',
'whitman-leaves.txt']
#查看文本所含的单词数量
>>> emma=nltk.corpus.gutenberg.words("austen-emma.txt")
>>> len(emma)
192427
#在文本中搜索单词
>>> emma=nltk.Text(nltk.corpus.gutenberg.words("austen-emma.txt"))
>>> emma.concordance("surprize")
Displaying 25 of 37 matches:
er father , was sometimes taken by surprize at his being still able to pity `
hem do the other any good ." " You surprize me ! Emma must do Harriet good : a
Knightley actually looked red with surprize and displeasure , as he stood up ,
r . Elton , and found to his great surprize , that Mr . Elton was actually on
d aid ." Emma saw Mrs . Weston ' s surprize , and felt that it must be great ,
father was quite taken up with the surprize of so sudden a journey , and his f
y , in all the favouring warmth of surprize and conjecture . She was , moreove
he appeared , to have her share of surprize , introduction , and pleasure . Th
ir plans ; and it was an agreeable surprize to her , therefore , to perceive t
talking aunt had taken me quite by surprize , it must have been the death of m
f all the dialogue which ensued of surprize , and inquiry , and congratulation
the present . They might chuse to surprize her ." Mrs . Cole had many to agre
the mode of it , the mystery , the surprize , is more like a young woman ' s s
to her song took her agreeably by surprize -- a second , slightly but correct
" " Oh ! no -- there is nothing to surprize one at all .-- A pretty fortune ;
t to be considered . Emma ' s only surprize was that Jane Fairfax should accep
of your admiration may take you by surprize some day or other ." Mr . Knightle
ation for her will ever take me by surprize .-- I never had a thought of her i
expected by the best judges , for surprize -- but there was great joy . Mr .
sound of at first , without great surprize ." So unreasonably early !" she w
d Frank Churchill , with a look of surprize and displeasure .-- " That is easy
; and Emma could imagine with what surprize and mortification she must be retu
tled that Jane should go . Quite a surprize to me ! I had not the least idea !
. It is impossible to express our surprize . He came to speak to his father o
g engaged !" Emma even jumped with surprize ;-- and , horror - struck , exclai
>>> from nltk.corpus import gutenberg
>>> for fileid in gutenberg.fileids():
...     num_chars=len(gutenberg.raw(fileid)) #文本的字符总数,包含空格
...     num_words=len(gutenberg.words(fileid)) #文本所含的单词数量
...     num_sents=len(gutenberg.sents(fileid)) #文本所含的句子数量
...     num_vocab=len(set([w.lower() for w in gutenberg.words(fileid)])) #不区分大小写的词汇表大小
...     print(int(num_chars/num_words), #平均词长
...           int(num_words/num_sents), #平均句子长度
...           int(num_words/num_vocab), #每个词出现的平均次数
...           fileid) #文件标识
...
运行结果:
4 24 26 austen-emma.txt
4 26 16 austen-persuasion.txt
4 28 22 austen-sense.txt
4 33 79 bible-kjv.txt
4 19 5 blake-poems.txt
4 19 14 bryant-stories.txt
4 17 12 burgess-busterbrown.txt
4 20 12 carroll-alice.txt
4 20 11 chesterton-ball.txt
4 22 11 chesterton-brown.txt
4 18 10 chesterton-thursday.txt
4 20 24 edgeworth-parents.txt
4 25 15 melville-moby_dick.txt
4 52 10 milton-paradise.txt
4 11 8 shakespeare-caesar.txt
4 12 7 shakespeare-hamlet.txt
4 12 6 shakespeare-macbeth.txt
4 36 12 whitman-leaves.txt
网络和聊天文本
>>> from nltk.corpus import webtext
>>> for fileid in webtext.fileids():
...     print(fileid,webtext.raw(fileid)[:65])
...
运行结果:
firefox.txt Cookie Manager: "Don't allow sites that set removed cookies to se
grail.txt SCENE 1: [wind] [clop clop clop]
KING ARTHUR: Whoa there! [clop
overheard.txt White guy: So, do you have any plans for this evening?
Asian girl
pirates.txt PIRATES OF THE CARRIBEAN: DEAD MAN'S CHEST, by Ted Elliott & Terr
singles.txt 25 SEXY MALE, seeks attrac older single lady, for discreet encoun
wine.txt Lovely delicate, fragrant Rhone wine. Polished leather and strawb
#聊天记录
>>> from nltk.corpus import nps_chat
>>> chatroom=nps_chat.posts('10-19-20s_706posts.xml')
>>> chatroom[123]
['i', 'do', "n't", 'want', 'hot','pics', 'of', 'a', 'female', ',', 'I', 'can', 'look', 'in', 'a', 'mirror', '.']
布朗语料库
>>> from nltk.corpus import brown
>>> brown.categories() #语料库中的分类
['adventure', #探险
'belles_lettres', #纯文学
'editorial', #社论
'fiction', #小说
'government', #政府
'hobbies', #爱好
'humor', #幽默
'learned', #博览
'lore', #传说
'mystery', #推理小说
'news', #新闻
'religion',#宗教
'reviews', #评论
'romance', #言情
'science_fiction'] #科幻
#查看新闻类
>>> brown.words(categories='news')
['The', 'Fulton', 'County', 'Grand','Jury', 'said', ...]
#查看指定文件名的单词
>>> brown.words(fileids=['cg22'])
['Does', 'our', 'society', 'have', 'a','runaway', ',', ...]
#指定分类划分句子
>>> brown.sents(categories=['news','editorial','reviews'])
[['The', 'Fulton', 'County', 'Grand','Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's",'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence',"''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The','jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the','City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge','of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and','thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the','manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]
#对特定文体中的情态动词进行计数
>>> news_text=brown.words(categories='news')
>>> fdist=nltk.FreqDist([w.lower() for w in news_text])
>>> modals=['can','could','may','might','must','will']
>>> for m in modals:
...     print(m+" : ",fdist[m])
...
can : 94
could : 87
may : 93
might : 38
must : 53
will : 389
#条件频率分布函数
>>> cfd=nltk.ConditionalFreqDist(
...     (genre,word)
...     for genre in brown.categories()
...     for word in brown.words(categories=genre))
>>> genres=['news','religion','hobbies','science_fiction','romance','humor']
>>> modals=['can','could','may','might','must','will']
>>> cfd.tabulate(conditions=genres,samples=modals)
                  can could   may might  must  will
           news    93    86    66    38    50   389
       religion    82    59    78    12    54    71
        hobbies   268    58   131    22    83   264
science_fiction    16    49     4    12     8    16
        romance    74   193    11    51    45    43
          humor    16    30     8     8     9    13
路透社语料库
>>> from nltk.corpus import reuters
>>> reuters.fileids()
#测试数据
['test/14826', 'test/14828', 'test/14829','test/14832', 'test/14833', 'test/14839', 'test/14840',
...
'test/21576',
#训练数据
'training/1', 'training/10','training/100', 'training/1000', 'training/10000', 'training/10002','training/10005', 'training/10008', 'training/10011',
...
'training/9995']
#主题分类,一则新闻可能涉及多个主题
>>> reuters.categories()
['acq', 'alum', 'barley', 'bop', 'carcass','castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper','copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl','dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut','groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest','ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil','livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha','nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium','palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand','rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship','silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal','sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil','wheat', 'wpi', 'yen', 'zinc']
#查找一个或多个文档涵盖的主题
>>> reuters.categories('training/9865')
['barley', 'corn', 'grain', 'wheat']
>>> reuters.categories(['training/9865','training/9880'])
['barley', 'corn', 'grain', 'money-fx','wheat']
#查找包含 一个或多个类别的文档
>>> reuters.fileids('barley')
['test/15618', 'test/15649', 'test/15676','test/15728', 'test/15871', 'test/15875', 'test/15952', 'test/17767','test/17769', 'test/18024', 'test/18263', 'test/18908', 'test/19275','test/19668', 'training/10175', 'training/1067', 'training/11208','training/11316', 'training/11885', 'training/12428', 'training/13099','training/13744', 'training/13795', 'training/13852', 'training/13856','training/1652', 'training/1970', 'training/2044', 'training/2171','training/2172', 'training/2191', 'training/2217', 'training/2232','training/3132', 'training/3324', 'training/395', 'training/4280','training/4296', 'training/5', 'training/501', 'training/5467','training/5610', 'training/5640', 'training/6626', 'training/7205','training/7579', 'training/8213', 'training/8257', 'training/8759', 'training/9865','training/9958']
>>> reuters.fileids(['barley','corn'])
['test/14832', 'test/14858', 'test/15033','test/15043', 'test/15106', 'test/15287', 'test/15341',
...
]
#查找我们想要的单词(可按文件或类别指定)
>>> reuters.words('training/9865')[:14]
['FRENCH', 'FREE', 'MARKET', 'CEREAL','EXPORT', 'BIDS', 'DETAILED', 'French', 'operators', 'have', 'requested','licences', 'to', 'export']
>>> reuters.words(['training/9865','training/9880'])
['FRENCH', 'FREE', 'MARKET', 'CEREAL','EXPORT', ...]
>>> reuters.words(categories='barley')
['FRENCH', 'FREE', 'MARKET', 'CEREAL','EXPORT', ...]
>>> reuters.words(categories=['barley','corn'])
['THAI', 'TRADE', 'DEFICIT', 'WIDENS','IN', 'FIRST', ...]
就职演说语料库
>>> from nltk.corpus import inaugural
>>> inaugural.fileids()
['1789-Washington.txt','1793-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt','1805-Jefferson.txt', '1809-Madison.txt', '1813-Madison.txt','1817-Monroe.txt', '1821-Monroe.txt', '1825-Adams.txt', '1829-Jackson.txt','1833-Jackson.txt', '1837-VanBuren.txt', '1841-Harrison.txt', '1845-Polk.txt','1849-Taylor.txt', '1853-Pierce.txt', '1857-Buchanan.txt', '1861-Lincoln.txt','1865-Lincoln.txt', '1869-Grant.txt', '1873-Grant.txt', '1877-Hayes.txt','1881-Garfield.txt', '1885-Cleveland.txt', '1889-Harrison.txt','1893-Cleveland.txt', '1897-McKinley.txt', '1901-McKinley.txt','1905-Roosevelt.txt', '1909-Taft.txt', '1913-Wilson.txt', '1917-Wilson.txt','1921-Harding.txt', '1925-Coolidge.txt', '1929-Hoover.txt','1933-Roosevelt.txt', '1937-Roosevelt.txt', '1941-Roosevelt.txt','1945-