NLP with Python, Chapter 2: Accessing Text Corpora and Lexical Resources

1. Accessing Text Corpora

from nltk.corpus import gutenberg

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    # average word length, average sentence length, lexical diversity (tokens per type)
    print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid


>>> from nltk.corpus import brown
>>> brown.categories()
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
>>> import nltk
>>> hobbies_text = brown.words(categories='hobbies')
>>> fdist = nltk.FreqDist(w.lower() for w in hobbies_text)
>>> wh_words = ['what', 'when', 'where']
>>> for wh in wh_words:
    print wh + ':', fdist[wh]



what: 108
when: 164
where: 77


>>> cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))




>>> genres = ['adventure', 'editorial']
>>> words = ['what', 'when', 'how']
>>> cfd.tabulate(conditions=genres, samples=words)
           what  when  how
adventure   110   126   35
editorial    84   103   43


>>> from nltk.corpus import reuters
>>> aa = reuters.fileids()
>>> aa[:1]
['test/14826']
>>> reuters.categories('test/14826')
['trade']
>>> reuters.words('test/14826')[:10]
['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN']


>>> from nltk.corpus import reuters
>>> reuters.categories()[:3]
['acq', 'alum', 'barley']
>>> reuters.words(categories = ['acq','alum'])
['SUMITOMO', 'BANK', 'AIMS', 'AT', 'QUICK', 'RECOVERY', ...]
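
The mapping works in both directions: just as categories() accepts a fileid, fileids() accepts one or more categories. A small sketch (calls only, outputs omitted here):

>>> reuters.fileids('barley')            # documents tagged barley
>>> reuters.fileids(['barley', 'corn'])  # documents tagged with either category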


>>> from nltk.corpus import udhr
>>> langs = ['Chickasaw', 'English', 'German_Deutsch']
>>> cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in langs
    for word in udhr.words(lang + '-Latin1'))
>>> cfd.plot(cumulative=True)


>>> raw_text = udhr.raw('Zulu-Latin1')
>>> nltk.FreqDist(raw_text).plot()    # counts characters, so this plots letter frequencies


>>> from nltk.corpus import PlaintextCorpusReader
>>> corpus_root = '/usr/share/dict'
>>> wordlists = PlaintextCorpusReader(corpus_root, '.*')
>>> wordlists.fileids()


2. Conditional Frequency Distributions


>>> from nltk.corpus import brown
>>> genre_word = [(genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)]
>>> len(genre_word)
1161192
>>> genre_word[:4]
[('adventure', 'Dan'), ('adventure', 'Morgan'), ('adventure', 'told'), ('adventure', 'himself')]
>>> import nltk
>>> cfd = nltk.ConditionalFreqDist(genre_word)
>>> cfd
<ConditionalFreqDist with 15 conditions>
>>> cfd.conditions()
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
>>> cfd['science_fiction']['the']
652
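
Each condition indexes a complete FreqDist, so anything a FreqDist supports works per genre. A small sketch (calls only, outputs omitted here):

>>> cfd['news'].N()       # total word tokens in the news genre
>>> cfd['news'].max()     # the single most frequent word in news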



3. More on Python

Give each file a short but descriptive name, use all lowercase letters, separate words with underscores, and use the .py filename extension, e.g. monty_python.py.

A collection of variable and function definitions in a file is called a Python module. A collection of related modules is called a package. The NLTK code that handles the Brown Corpus is a module; the collection of code for handling all the different corpora is a package. NLTK itself is a collection of packages, sometimes called a library. A minimal sketch of the module idea follows.
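
As a hypothetical illustration, suppose we put a single function in a file named monty_python.py (following the naming convention above; the function shout is made up for this sketch):

# monty_python.py - a module: a file of variable and function definitions
def shout(text):
    return text.upper() + '!'

>>> import monty_python
>>> monty_python.shout('spam')
'SPAM!'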

4. Lexical Resources


>>> def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)


>>> unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))


Names

>>> from nltk.corpus import names
>>> names.fileids()
['female.txt', 'male.txt']
>>> cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1])
    for fileid in names.fileids()
    for name in names.words(fileid))
>>> cfd.plot()
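
The plot shows that names ending in a, e, and i are predominantly female. The counts behind any one letter can be checked directly, since each condition is a FreqDist (calls only, outputs omitted here):

>>> cfd['female.txt']['a']   # female names ending in 'a'
>>> cfd['male.txt']['a']     # male names ending in 'a'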


Comparative Wordlists (Swadesh)

>>> from nltk.corpus import swadesh
>>> fr2en = swadesh.entries(['fr', 'en'])
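
The (French, English) pairs convert straight into a mini translation dictionary; this usage follows the corresponding NLTK book example:

>>> translate = dict(fr2en)
>>> translate['chien']
'dog'
>>> translate['jeter']
'throw'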


5. WordNet


>>> from nltk.corpus import wordnet as wn
>>> wn.synsets('motorcar')
[Synset('car.n.01')]
>>> wn.synset('car.n.01').lemma_names
['car', 'auto', 'automobile', 'machine', 'motorcar']

Lemma: car.n.01.motorcar (the first noun sense of the word car; a lemma pairs a synset with a word). A lemma can also be looked up directly, as sketched below.
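
These lookups use the property-style API of the older NLTK release this post is based on (newer releases make them method calls):

>>> wn.lemma('car.n.01.automobile')
Lemma('car.n.01.automobile')
>>> wn.lemma('car.n.01.automobile').synset
Synset('car.n.01')
>>> wn.lemma('car.n.01.automobile').name
'automobile'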

>>> wn.synset('car.n.01').examples
['he needs a car to get to work']
>>> wn.synset('car.n.01').definition
'a motor vehicle with four wheels; usually propelled by an internal combustion engine'


>>> for synset in wn.synsets('car'):
    print synset.lemma_names

['car', 'auto', 'automobile', 'machine', 'motorcar']
['car', 'railcar', 'railway_car', 'railroad_car']
['car', 'gondola']
['car', 'elevator_car']
['cable_car', 'car']

>>> wn.lemmas('car')
[Lemma('car.n.01.car'), Lemma('car.n.02.car'), Lemma('car.n.03.car'), Lemma('car.n.04.car'), Lemma('cable_car.n.01.car')]


>>> wn.synset('big_cat.n.01').definition
'any of several large cats typically able to roar and living in the wild'
>>> wn.synset('big_cat.n.01').examples


>>> motorcar = wn.synset('car.n.01')
>>> types_of_motorcar = motorcar.hyponyms()
>>> sorted([lemma.name for synset in types_of_motorcar for lemma in synset.lemmas])
['Model_T', 'S.U.V.', 'SUV', 'Stanley_Steamer', 'ambulance', 'beach_waggon', 'beach_wagon', 'bus', 'cab', 'compact', 'compact_car', 'convertible', 'coupe', 'cruiser', 'electric', 'electric_automobile', 'electric_car', 'estate_car', 'gas_guzzler', 'hack', 'hardtop', 'hatchback', 'heap', 'horseless_carriage', 'hot-rod', 'hot_rod', 'jalopy', 'jeep', 'landrover', 'limo', 'limousine', 'loaner', 'minicar', 'minivan', 'pace_car', 'patrol_car', 'phaeton', 'police_car', 'police_cruiser', 'prowl_car', 'race_car', 'racer', 'racing_car', 'roadster', 'runabout', 'saloon', 'secondhand_car', 'sedan', 'sport_car', 'sport_utility', 'sport_utility_vehicle', 'sports_car', 'squad_car', 'station_waggon', 'station_wagon', 'stock_car', 'subcompact', 'subcompact_car', 'taxi', 'taxicab', 'tourer', 'touring_car', 'two-seater', 'used-car', 'waggon', 'wagon']


>>> motorcar = wn.synset('car.n.01')
>>> motorcar.hypernyms()
[Synset('motor_vehicle.n.01')]
>>> paths = motorcar.hypernym_paths()
>>> len(paths)
2
>>> paths
[[Synset('entity.n.01'), Synset('physical_entity.n.01'), Synset('object.n.01'), Synset('whole.n.02'), Synset('artifact.n.01'), Synset('instrumentality.n.03'), Synset('container.n.01'), Synset('wheeled_vehicle.n.01'), Synset('self-propelled_vehicle.n.01'), Synset('motor_vehicle.n.01'), Synset('car.n.01')], [Synset('entity.n.01'), Synset('physical_entity.n.01'), Synset('object.n.01'), Synset('whole.n.02'), Synset('artifact.n.01'), Synset('instrumentality.n.03'), Synset('conveyance.n.03'), Synset('vehicle.n.01'), Synset('wheeled_vehicle.n.01'), Synset('self-propelled_vehicle.n.01'), Synset('motor_vehicle.n.01'), Synset('car.n.01')]]


>>> for synset in wn.synsets('mint', wn.NOUN):
    print synset.name + ':', synset.definition

batch.n.02: (often followed by `of') a large number or amount or extent
mint.n.02: any north temperate plant of the genus Mentha with aromatic leaves and small mauve flowers


>>> wn.synset('walk.v.01').entailments()
[Synset('step.v.01')]


>>> wn.lemma('supply.n.02.supply').antonyms()
[Lemma('demand.n.02.demand')]


>>> dir(wn.synset('harmony.n.02'))
['__class__', '__delattr__', '__dict__', ...]


>>> wn.synset('vertebrate.n.01').min_depth()
8
>>> wn.synset('entity.n.01').min_depth()
0


>>> right = wn.synset('right_whale.n.01')
>>> tortoise = wn.synset('tortoise.n.01')
>>> novel = wn.synset('novel.n.01')
>>> right.path_similarity(tortoise)
0.076923076923076927
>>> right.path_similarity(novel)
0.043478260869565216
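
path_similarity is based on the shortest path linking the two senses through the hypernym hierarchy; the shared ancestor can be inspected directly (this mirrors the corresponding NLTK book example, and ties back to the min_depth of 8 for vertebrate.n.01 shown above):

>>> right.lowest_common_hypernyms(tortoise)
[Synset('vertebrate.n.01')]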
