NLTK03 Natural Language Processing with Python, code 02: Accessing Text Corpora and Lexical Resources

02 Accessing Text Corpora and Lexical Resources

# -*- coding: utf-8 -*-
# Windows 10, Python 3.5.3/3.6.1, NLTK 3.2.4
# Natural Language Processing with Python, 02: Accessing Text Corpora and Lexical Resources
# pnlp02.py

# 2.1 Accessing Text Corpora
# The Gutenberg Corpus
import nltk
gtb = nltk.corpus.gutenberg.fileids()
print(gtb)
'''
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 
 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 
 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 
 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 
 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
'''

emma = nltk.corpus.gutenberg.words('austen-emma.txt')
print(len(emma)) # 192427

emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")  # concordance() prints its matches and returns None, so don't wrap it in print()
'''
Displaying 25 of 37 matches:
er father , was sometimes taken by surprize at his being still able to pity ` 
...
g engaged !" Emma even jumped with surprize ;-- and , horror - struck , exclai
'''

from nltk.corpus import gutenberg
print(gutenberg.fileids())
'''
['austen-emma.txt', 'austen-persuasion.txt', ..., 'whitman-leaves.txt']
'''

# Compute average word length, average sentence length, and the average number
# of times each vocabulary item appears (a lexical diversity score).
# Note: the word-length figure of 4 includes a trailing space character, so it is really 3.
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid)
'''
4 24 26 austen-emma.txt
4 26 16 austen-persuasion.txt
4 28 22 austen-sense.txt
4 33 79 bible-kjv.txt
4 19 5 blake-poems.txt
4 19 14 bryant-stories.txt
4 17 12 burgess-busterbrown.txt
4 20 12 carroll-alice.txt
4 20 11 chesterton-ball.txt
4 22 11 chesterton-brown.txt
4 18 10 chesterton-thursday.txt
4 20 24 edgeworth-parents.txt
4 25 15 melville-moby_dick.txt
4 52 10 milton-paradise.txt
4 11 8 shakespeare-caesar.txt
4 12 7 shakespeare-hamlet.txt
4 12 6 shakespeare-macbeth.txt
4 36 12 whitman-leaves.txt
'''

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
print(macbeth_sentences)
# [['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']'], ['Actus', 'Primus', '.'], ...]
print(macbeth_sentences[1037])
# ['Good', 'night', ',', 'and', 'better', 'health', 'Attend', 'his', 'Maiesty']
longest_len = max(len(s) for s in macbeth_sentences)
longest_sents = [s for s in macbeth_sentences if len(s) == longest_len]
print(longest_sents)
'''
[['Doubtfull', 'it', 'stood', ',', 'As', 'two', 'spent', 'Swimmers', ',', 'that', 'doe', 'cling', 
...
'Head', 'vpon', 'our', 'Battlements']]
'''

# Web and Chat Text
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')
'''
firefox.txt Cookie Manager: "Don't allow sites that set removed cookies to se ...
grail.txt SCENE 1: [wind] [clop clop clop] 
KING ARTHUR: Whoa there!  [clop ...
overheard.txt White guy: So, do you have any plans for this evening?
Asian girl ...
pirates.txt PIRATES OF THE CARRIBEAN: DEAD MAN'S CHEST, by Ted Elliott & Terr ...
singles.txt 25 SEXY MALE, seeks attrac older single lady, for discreet encoun ...
wine.txt Lovely delicate, fragrant Rhone wine. Polished leather and strawb ...
'''

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom)
'''[['now', 'im', 'left', 'with', 'this', 'gay', 'name'], [':P'], ...]'''

# The Brown Corpus
from nltk.corpus import brown
print(brown.categories())
'''
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 
'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
'''
print(brown.words(categories='news'))
'''['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]'''
print(brown.words(fileids=['cg22']))
'''['Does', 'our', 'society', 'have', 'a', 'runaway', ',', ...]'''
print(brown.sents(categories=['news', 'editorial', 'reviews']))
'''
[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 
'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 
'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 
'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', 
',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 
'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]
'''

from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m])
'''
can: 94
could: 87
may: 93
might: 38
must: 53
will: 389
'''

cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)  # tabulate() prints the table itself and returns None
'''
                  can could   may might  must  will 
           news    93    86    66    38    50   389 
       religion    82    59    78    12    54    71 
        hobbies   268    58   131    22    83   264 
science_fiction    16    49     4    12     8    16 
        romance    74   193    11    51    45    43 
          humor    16    30     8     8     9    13 
'''

# The Reuters Corpus
from nltk.corpus import reuters
print(reuters.fileids())
'''
['test/14826', 'test/14828', 'test/14829', ..., 'training/9994', 'training/9995']
'''
print(reuters.categories())
'''
['acq', 'alum', 'barley', 'bop', ..., 'wpi', 'yen', 'zinc']
'''
print(reuters.categories('training/9865'))
# ['barley', 'corn', 'grain', 'wheat']
print(reuters.categories(['training/9865', 'training/9880']))
# ['barley', 'corn', 'grain', 'money-fx', 'wheat']
print(reuters.fileids('barley'))
'''
['test/15618', 'test/15649', 'test/15676', ..., 'training/9865', 'training/9958']
'''
print(reuters.fileids(['barley', 'corn']))
'''
['test/14832', 'test/14858', 'test/15033', ..., 'training/9958', 'training/9989']
'''
print(reuters.words('training/9865')[:14])
'''
['FRENCH', 'FREE', 'MARKET', 'CEREAL', 'EXPORT', 'BIDS', 'DETAILED', 'French', 'operators', 'have', 
'requested', 'licences', 'to', 'export']
'''
print(reuters.words(['training/9865', 'training/9880']))
'''['FRENCH', 'FREE', 'MARKET', 'CEREAL', 'EXPORT', ...]'''
print(reuters.words(categories='barley'))
'''['FRENCH', 'FREE', 'MARKET', 'CEREAL', 'EXPORT', ...]'''
print(reuters.words(categories=['barley', 'corn']))
'''['THAI', 'TRADE', 'DEFICIT', 'WIDENS', 'IN', 'FIRST', ...]'''

# The Inaugural Address Corpus
from nltk.corpus import inaugural
print(inaugural.fileids())
'''['1789-Washington.txt', '1793-Washington.txt', ..., '2005-Bush.txt', '2009-Obama.txt']'''
res = [fileid[:4] for fileid in inaugural.fileids()]
print(res)
'''['1789', '1793', '1797', ..., '2005', '2009']'''
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target)
)
cfd.plot()

# Annotated Text Corpora
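# No code accompanied this heading in the original; as a minimal illustration,
# annotated corpora expose their annotations through dedicated methods -- e.g.
# the Brown Corpus ships with part-of-speech tags via tagged_words():
print(nltk.corpus.brown.tagged_words())
'''[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ...]'''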

# Corpora in Other Languages
print(nltk.corpus.cess_esp.words())
'''['El', 'grupo', 'estatal', 'Electricité_de_France', ...]'''
print(nltk.corpus.floresta.words())
'''['Um', 'revivalismo', 'refrescante', 'O', '7_e_Meio', ...]'''
print(nltk.corpus.indian.words('hindi.pos'))
'''['पूर्ण', 'प्रतिबंध', 'हटाओ', ':', 'इराक', 'संयुक्त', ...]'''
print(nltk.corpus.udhr.fileids())
'''['Abkhaz-Cyrillic+Abkh', 'Abkhaz-UTF8', ..., 'Zhuang-Latin1', 'Zulu-Latin1']'''
print(nltk.corpus.udhr.words('Javanese-Latin1')[11:])
'''['Saben', 'umat', 'manungsa', 'lair', 'kanthi', 'hak', ...]'''

from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1')
)
cfd.plot(cumulative=True)

# Text Corpus Structure
# help(nltk.corpus.reader)
raw = gutenberg.raw("burgess-busterbrown.txt")
print(raw[1:20]) # The Adventures of B
words = gutenberg.words("burgess-busterbrown.txt")
print(words[1:20])
'''['The', 'Adventures', 'of', 'Buster', ..., 'Bear']'''
sents = gutenberg.sents("burgess-busterbrown.txt")
print(sents[1:20])
'''[['I'], ['BUSTER', 'BEAR', 'GOES', 'FISHING'], ..., 'for', 'breakfast', '.']]'''

# Loading Your Own Corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'D:/tmp/tensorflow/data'
wordlists = PlaintextCorpusReader(corpus_root, r'my.*\.txt')  # the original pattern 'my*\.txt' would not match 'mya.txt'
print(wordlists.fileids())         # prints [] if no matching files exist
print(wordlists.readme())          # requires a README file under corpus_root
print(wordlists.words('mya.txt'))  # requires a file named mya.txt
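
# The book also shows a reader for treebank-style corpora; a sketch, assuming
# a local copy of the parsed Penn Treebank at the path below:
from nltk.corpus import BracketParseCorpusReader
ptb_root = r'C:\corpora\penntreebank\parsed\mrg\wsj'
ptb = BracketParseCorpusReader(ptb_root, r'.*/wsj_.*\.mrg')
print(ptb.fileids())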


# 2.2 Conditional Frequency Distributions: ConditionalFreqDist
# Conditions and Events
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said']
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County')]
# Counting Words by Genre
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)

genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(len(genre_word)) # 170576
print(genre_word[:4]) # [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand')]
print(genre_word[-4:]) # [('romance', 'afraid'), ('romance', 'not'), ('romance', "''"), ('romance', '.')]
cfd = nltk.ConditionalFreqDist(genre_word)
print(cfd) # <ConditionalFreqDist with 2 conditions>
print(cfd.conditions()) # ['news', 'romance']
print(cfd['news']) # <FreqDist with 14394 samples and 100554 outcomes>
print(cfd['romance']) # <FreqDist with 8452 samples and 70022 outcomes>
print(list(cfd['romance'])) # ['They', 'neither', 'liked', ..., 'expect', 'episode']
print(cfd['romance']['could']) # 193

# Plotting and Tabulating Distributions
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target)
)

from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1')
)
cfd.tabulate(conditions=['English', 'German_Deutsch'], samples=range(10), cumulative=True)
'''
                  0    1    2    3    4    5    6    7    8    9 
       English    0  185  525  883  997 1166 1283 1440 1558 1638 
German_Deutsch    0  171  263  614  717  894 1013 1110 1213 1275 
'''
# Generating Random Text with Bigrams -- see the generate_model sketch below
sent = ['In', 'the', 'beginning', 'God', 'Created', 'the', 'heaven', 'and', 'the', 'earth', '.']
print(nltk.bigrams(sent)) # <generator object bigrams at 0x00000219653297D8>
print(list(nltk.bigrams(sent)))
'''
[('In', 'the'), ('the', 'beginning'), ('beginning', 'God'), ('God', 'Created'), ('Created', 'the'), 
('the', 'heaven'), ('heaven', 'and'), ('and', 'the'), ('the', 'earth'), ('earth', '.')]
'''
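
# The heading promises random text generation; this is the book's generate_model
# example, which repeatedly emits the most frequent successor of the current word:
def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()  # most likely word to follow

text = nltk.corpus.genesis.words('english-kjv.txt')
cfd = nltk.ConditionalFreqDist(nltk.bigrams(text))
generate_model(cfd, 'living')
'''living creature that he said , and the land of the land of the land'''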

# 2.3 More Python: Reusing Code
# Creating Programs with a Text Editor
# Functions
# Modules
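
# These headings had no code in the original; the book's example here is a
# plural() function, saved in a module (the book uses textproc.py) and then
# reused elsewhere via `from textproc import plural`:
def plural(word):
    if word.endswith('y'):
        return word[:-1] + 'ies'
    elif word[-1] in 'sx' or word[-2:] in ['sh', 'ch']:
        return word + 'es'
    elif word.endswith('an'):
        return word[:-2] + 'en'
    else:
        return word + 's'

print(plural('fairy'))  # fairies
print(plural('wish'))   # wishes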

# 2.4 Lexical Resources
# Wordlist Corpora
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)
res = unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))
print(res)
'''['abbeyland', 'abhorred', 'abilities', ..., 'yielded', 'youngest']'''

res = unusual_words(nltk.corpus.nps_chat.words())
print(res)
'''['aaaaaaaaaaaaaaaaa', 'aaahhhh', 'abortions', ..., 'zzzzzzzing', 'zzzzzzzz']'''


from nltk.corpus import stopwords
print(stopwords.words('english'))
'''['i', 'me', 'my', 'myself', 'we', ..., 'won', 'wouldn']'''

def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content)/len(text)
print(content_fraction(nltk.corpus.reuters.words())) # 0.735240435097661

puzzle_letters = nltk.FreqDist('egivrvonl')
obligatory = 'r'
wordlist = nltk.corpus.words.words()
res = [w for w in wordlist if len(w) >= 6
       and obligatory in w
       and nltk.FreqDist(w) <= puzzle_letters]
print(res)
'''['glover', 'gorlin', 'govern', 'grovel', 'ignore', ..., 'violer', 'virole']'''

names = nltk.corpus.names
print(names.fileids()) # ['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
res = [w for w in male_names if w in female_names]
print(res)
'''['Abbey', 'Abbie', 'Abby', ..., 'Winnie', 'Winny', 'Wynn']'''
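
# The book follows this with a conditional frequency distribution over each
# name's final letter, split by gender file; the plot shows that names ending
# in a, e, i are overwhelmingly female:
cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1])
    for fileid in names.fileids()
    for name in names.words(fileid))
cfd.plot()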

# A Pronouncing Dictionary
entries = nltk.corpus.cmudict.entries()
print(len(entries)) # 133737
for entry in entries[39943:39951]:
    print(entry)
'''
('explorer', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'ER0'])
('explorers', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'ER0', 'Z'])
('explores', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'Z'])
('exploring', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'IH0', 'NG'])
('explosion', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'ZH', 'AH0', 'N'])
('explosions', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'ZH', 'AH0', 'N', 'Z'])
('explosive', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'S', 'IH0', 'V'])
('explosively', ['EH2', 'K', 'S', 'P', 'L', 'OW1', 'S', 'IH0', 'V', 'L', 'IY0'])
'''

for word, pron in entries:
    if len(pron) == 3:
        ph1, ph2, ph3 = pron
        if ph1 == 'P' and ph3 == 'T':
            print(word, ph2)
'''
pait EY1
pat AE1
...
put UH1
putt AH1
'''

syllable = ['N', 'IH0', 'K', 'S']  # note the digit 0; the original 'IHO' (letter O) matches no phone, hence []
res = [word for word, pron in entries if pron[-4:] == syllable]
print(res)
'''["atlantic's", 'audiotronics', 'avionics', ...]'''
res = [w for w, pron in entries if pron[-1] == 'M' and w[-1] == 'n']
print(res)
'''['autumn', 'column', 'condemn', 'damn', 'goddamn', 'hymn', 'solemn']'''
res = sorted(set(w[:2] for w, pron in entries if pron[0] == 'N' and w[0] != 'n'))
print(res)
'''['gn', 'kn', 'mn', 'pn']'''

def stress(pron):
    return [char for phone in pron for char in phone if char.isdigit()]
res = [w for w, pron in entries if stress(pron) == ['0', '1', '0', '2', '0']]
print(res)
'''['abbreviated', 'abbreviated', 'abbreviating', ..., 'vocabulary', 'voluntarism']'''
res = [w for w, pron in entries if stress(pron) == ['0', '2', '0', '1', '0']]
print(res)
'''['abbreviation', 'abbreviations', 'abomination', ..., 'wakabayashi', 'yekaterinburg']'''

p3 = [(pron[0] + '-' + pron[2], word)
      for (word, pron) in entries
      if pron[0] == 'P' and len(pron) == 3]
cfd = nltk.ConditionalFreqDist(p3)
for template in cfd.conditions():
    if len(cfd[template]) > 10:
        words = cfd[template].keys()
        wordlist = ' '.join(words)
        print(template, wordlist[:70] + "...")
'''
P-P paap paape pap pape papp paup peep pep pip pipe pipp poop pop pope pop...
P-R paar pair par pare parr pear peer pier poor poore por pore porr pour...
P-K pac pack paek paik pak pake paque peak peake pech peck peek perc perk ...
P-S pace pass pasts peace pearse pease perce pers perse pesce piece piss p...
P-L pahl pail paille pal pale pall paul paule paull peal peale pearl pearl...
P-N paign pain paine pan pane pawn payne peine pen penh penn pin pine pinn...
P-Z pais paiz pao's pas pause paws pays paz peas pease pei's perz pez pies...
P-T pait pat pate patt peart peat peet peete pert pet pete pett piet piett...
P-CH patch pautsch peach perch petsch petsche piche piech pietsch pitch pit...
P-UW1 peru peugh pew plew plue prew pru prue prugh pshew pugh...
'''

prondict = nltk.corpus.cmudict.dict()
print(prondict['fire']) # [['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']]
# print(prondict['blog']) # KeyError: 'blog'
prondict['blog'] = [['B', 'L', 'AA1', 'G']]
print(prondict['blog']) # [['B', 'L', 'AA1', 'G']]
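# Note: this updates only the in-memory dictionary, not the CMU corpus on disk.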

text = ['natural', 'language', 'processing']
res = [ph for w in text for ph in prondict[w][0]]
print(res)
'''
['N', 'AE1', 'CH', 'ER0', 'AH0', 'L', 'L', 'AE1', 'NG', 'G', 'W', 'AH0', 'JH', 'P', 
'R', 'AA1', 'S', 'EH0', 'S', 'IH0', 'NG']
'''
# Comparative Wordlists
from nltk.corpus import swadesh
print(swadesh.fileids())
'''
['be', 'bg', 'bs', 'ca', 'cs', 'cu', 'de', 'en', 'es', 'fr', 'hr', 'it', 'la', 'mk', 'nl', 
'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sr', 'sw', 'uk']
'''
print(swadesh.words('en'))
'''['I', 'you (singular), thou', ..., 'if', 'because', 'name']'''
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en)
'''[('je', 'I'), ('tu, vous', 'you (singular), thou'), ..., ('parce que', 'because'), ('nom', 'name')]'''
translate = dict(fr2en)
print(translate['chien']) # dog
print(translate['jeter']) # throw
de2en = swadesh.entries(['de', 'en'])
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(de2en))
translate.update(dict(es2en))
print(translate['Hund']) # dog
print(translate['perro']) # dog

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
for i in [139, 140, 141, 142]:
    print(swadesh.entries(languages)[i])
'''
('say', 'sagen', 'zeggen', 'decir', 'dire', 'dizer', 'dicere')
('sing', 'singen', 'zingen', 'cantar', 'chanter', 'cantar', 'canere')
('play', 'spielen', 'spelen', 'jugar', 'jouer', 'jogar, brincar', 'ludere')
('float', 'schweben', 'zweven', 'flotar', 'flotter', 'flutuar, boiar', 'fluctuare')
'''
# Lexical Tools: Toolbox and Shoebox
from nltk.corpus import toolbox
print(toolbox.entries('rotokas.dic'))
'''
[('kaa', [('ps', 'V'), ('pt', 'A'), ..., ('tkp', 'laplap'), ('dt', '28/Jul/2004')])]
'''
# 2.5 WordNet
# Senses and Synonyms
from nltk.corpus import wordnet as wn
print(wn.synsets('motorcar')) # [Synset('car.n.01')]
print(wn.synset('car.n.01').lemma_names()) # ['car', 'auto', 'automobile', 'machine', 'motorcar']
print(wn.synset('car.n.01').definition()) # a motor vehicle with four wheels; usually propelled by an internal combustion engine
print(wn.synset('car.n.01').examples()) # ['he needs a car to get to work']

print(wn.synset('car.n.01').lemmas())
'''
[Lemma('car.n.01.car'), Lemma('car.n.01.auto'), Lemma('car.n.01.automobile'), 
Lemma('car.n.01.machine'), Lemma('car.n.01.motorcar')]
'''
print(wn.lemma('car.n.01.automobile')) # Lemma('car.n.01.automobile')
print(wn.lemma('car.n.01.automobile').synset()) # Synset('car.n.01')
print(wn.lemma('car.n.01.automobile').name()) # automobile
print(wn.synsets('car'))
'''[Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'), Synset('cable_car.n.01')]'''
for synset in wn.synsets('car'):
    print(synset.lemma_names())
'''
['car', 'auto', 'automobile', 'machine', 'motorcar']
['car', 'railcar', 'railway_car', 'railroad_car']
['car', 'gondola']
['car', 'elevator_car']
['cable_car', 'car']
'''
print(wn.lemmas('car'))
'''
[Lemma('car.n.01.car'), Lemma('car.n.02.car'), Lemma('car.n.03.car'), 
Lemma('car.n.04.car'), Lemma('cable_car.n.01.car')]
'''

# The WordNet Hierarchy
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
print(types_of_motorcar[26]) # e.g. Synset('ambulance.n.01'); index order depends on the WordNet data version
res = sorted([lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas()])
print(res)
'''['Model_T', 'S.U.V.', 'SUV', 'Stanley_Steamer', ..., 'used-car', 'waggon', 'wagon']'''

print(motorcar.hypernyms()) # [Synset('motor_vehicle.n.01')]
paths = motorcar.hypernym_paths()
print(len(paths)) # 2
res = [synset.name() for synset in paths[0]]
print(res)
'''
['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'artifact.n.01', 'instrumentality.n.03',
 'container.n.01', 'wheeled_vehicle.n.01', 'self-propelled_vehicle.n.01', 'motor_vehicle.n.01', 'car.n.01']
'''
res = [synset.name() for synset in paths[1]]
print(res)
'''
['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'artifact.n.01', 'instrumentality.n.03', 
'conveyance.n.03', 'vehicle.n.01', 'wheeled_vehicle.n.01', 'self-propelled_vehicle.n.01', 
'motor_vehicle.n.01', 'car.n.01']
'''
print(motorcar.root_hypernyms()) # [Synset('entity.n.01')]

# More Lexical Relations
print(wn.synset('tree.n.01').part_meronyms())
'''[Synset('burl.n.02'), Synset('crown.n.07'), Synset('limb.n.02'), Synset('stump.n.01'), Synset('trunk.n.01')]'''
print(wn.synset('tree.n.01').substance_meronyms())
'''[Synset('heartwood.n.01'), Synset('sapwood.n.01')]'''
print(wn.synset('tree.n.01').member_holonyms())
'''[Synset('forest.n.01')]'''
for synset in wn.synsets('mint', wn.NOUN):
    print(synset.name() + ':', synset.definition())
'''
batch.n.02: (often followed by `of') a large number or amount or extent
mint.n.02: any north temperate plant of the genus Mentha with aromatic leaves and small mauve flowers
mint.n.03: any member of the mint family of plants
mint.n.04: the leaves of a mint plant used fresh or candied
mint.n.05: a candy that is flavored with a mint oil
mint.n.06: a plant where money is coined by authority of the government
'''
print(wn.synset('mint.n.04').part_holonyms()) # [Synset('mint.n.02')]
print(wn.synset('mint.n.04').substance_holonyms()) # [Synset('mint.n.05')]
print(wn.synset('walk.v.01').entailments()) # [Synset('step.v.01')]
print(wn.synset('eat.v.01').entailments()) # [Synset('chew.v.01'), Synset('swallow.v.01')]
print(wn.synset('tease.v.03').entailments()) # [Synset('arouse.v.07'), Synset('disappoint.v.01')]

print(wn.lemma('supply.n.02.supply').antonyms()) # [Lemma('demand.n.02.demand')]
print(wn.lemma('rush.v.01.rush').antonyms()) # [Lemma('linger.v.04.linger')]
print(wn.lemma('horizontal.a.01.horizontal').antonyms())
'''[Lemma('inclined.a.02.inclined'), Lemma('vertical.a.01.vertical')]'''
print(wn.lemma('staccato.r.01.staccato').antonyms()) # [Lemma('legato.r.01.legato')]

# Semantic Similarity
right = wn.synset('right_whale.n.01')
orca = wn.synset('orca.n.01')
minke = wn.synset('minke_whale.n.01')
tortoise = wn.synset('tortoise.n.01')
novel = wn.synset('novel.n.01')
print(right.lowest_common_hypernyms(minke)) # [Synset('baleen_whale.n.01')]
print(right.lowest_common_hypernyms(orca)) # [Synset('whale.n.02')]
print(right.lowest_common_hypernyms(tortoise)) # [Synset('vertebrate.n.01')]
print(right.lowest_common_hypernyms(novel)) # [Synset('entity.n.01')]

print(wn.synset('baleen_whale.n.01').min_depth()) # 14
print(wn.synset('whale.n.02').min_depth()) # 13
print(wn.synset('vertebrate.n.01').min_depth()) # 8
print(wn.synset('entity.n.01').min_depth()) # 0

print(right.path_similarity(minke)) # 0.25
print(right.path_similarity(orca)) # 0.16666666666666666
print(right.path_similarity(tortoise)) # 0.07692307692307693
print(right.path_similarity(novel)) # 0.043478260869565216
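
# A sketch of where these scores come from, assuming NLTK's path_similarity
# formula 1 / (shortest_path_distance + 1) over the hypernym hierarchy:
print(right.shortest_path_distance(minke)) # 3, and 1/(3+1) == 0.25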