NLP with Python, Chapter 2: Accessing Text Corpora and Lexical Resources

1. Accessing Text Corpora

from nltk.corpus import gutenberg

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    # average word length, average sentence length, lexical diversity (tokens per type)
    print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid


>>> from nltk.corpus import brown
>>> brown.categories()
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
>>> import nltk
>>> hobbies_text = brown.words(categories='hobbies')
>>> fdist = nltk.FreqDist(w.lower() for w in hobbies_text)
>>> wh_words = ['what', 'when', 'where']
>>> for wh in wh_words:
    print wh + ':', fdist[wh]



what: 108
when: 164
where: 77


>>> cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))




>>> genres = ['adventure', 'editorial']
>>> words = ['what', 'when', 'how']
>>> cfd.tabulate(conditions=genres, samples=words)
           what  when  how
adventure   110   126   35
editorial    84   103   43


>>> from nltk.corpus import reuters
>>> aa = reuters.fileids()
>>> aa[:1]
['test/14826']
>>> reuters.categories('test/14826')
['trade']
>>> reuters.words('test/14826')[:10]
['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN']


>>> from nltk.corpus import reuters
>>> reuters.categories()[:3]
['acq', 'alum', 'barley']
>>> reuters.words(categories = ['acq','alum'])
['SUMITOMO', 'BANK', 'AIMS', 'AT', 'QUICK', 'RECOVERY', ...]
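
The mapping works in both directions: just as categories() accepts a fileid, fileids() accepts one or more categories. A small sketch (calls only, outputs omitted here):

>>> reuters.fileids('barley')            # documents tagged barley
>>> reuters.fileids(['barley', 'corn'])  # documents tagged with either category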


>>> from nltk.corpus import udhr
>>> langs = ['Chickasaw', 'English', 'German_Deutsch']
>>> cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in langs
    for word in udhr.words(lang + '-Latin1'))
>>> cfd.plot(cumulative=True)


>>> raw_text = udhr.raw('Zulu-Latin1')
>>> nltk.FreqDist(raw_text).plot()    # counts characters, so this plots letter frequencies


>>> from nltk.corpus import PlaintextCorpusReader
>>> corpus_root = '/usr/share/dict'
>>> wordlists = PlaintextCorpusReader(corpus_root, '.*')
>>> wordlists.fileids()


2. Conditional Frequency Distributions


>>> from nltk.corpus import brown
>>> genre_word = [(genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)]
>>> len(genre_word)
1161192
>>> genre_word[:4]
[('adventure', 'Dan'), ('adventure', 'Morgan'), ('adventure', 'told'), ('adventure', 'himself')]
>>> import nltk
>>> cfd = nltk.ConditionalFreqDist(genre_word)
>>> cfd
<ConditionalFreqDist with 15 conditions>
>>> cfd.conditions()
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
>>> cfd['science_fiction']['the']
652
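
Each condition indexes a complete FreqDist, so anything a FreqDist supports works per genre. A small sketch (calls only, outputs omitted here):

>>> cfd['news'].N()       # total word tokens in the news genre
>>> cfd['news'].max()     # the single most frequent word in news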



3. More on Python

Give each file a short but descriptive name, use all lowercase letters, separate words with underscores, and use the .py filename extension, e.g. monty_python.py.

A collection of variable and function definitions in a file is called a Python module. A collection of related modules is called a package. The NLTK code that handles the Brown Corpus is a module; the collection of code for handling all the different corpora is a package. NLTK itself is a collection of packages, sometimes called a library. A minimal sketch of the module idea follows.
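
As a hypothetical illustration, suppose we put a single function in a file named monty_python.py (following the naming convention above; the function shout is made up for this sketch):

# monty_python.py - a module: a file of variable and function definitions
def shout(text):
    return text.upper() + '!'

>>> import monty_python
>>> monty_python.shout('spam')
'SPAM!'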

4. Lexical Resources


>>> def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)


>>> unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))


Names

>>> from nltk.corpus import names
>>> names.fileids()
['female.txt', 'male.txt']
>>> cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1])
    for fileid in names.fileids()
    for name in names.words(fileid))
>>> cfd.plot()
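
The plot shows that names ending in a, e, and i are predominantly female. The counts behind any one letter can be checked directly, since each condition is a FreqDist (calls only, outputs omitted here):

>>> cfd['female.txt']['a']   # female names ending in 'a'
>>> cfd['male.txt']['a']     # male names ending in 'a'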


Comparative Wordlists (Swadesh)

>>> from nltk.corpus import swadesh
>>> fr2en = swadesh.entries(['fr', 'en'])
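
The (French, English) pairs convert straight into a mini translation dictionary; this usage follows the corresponding NLTK book example:

>>> translate = dict(fr2en)
>>> translate['chien']
'dog'
>>> translate['jeter']
'throw'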


5. WordNet


>>> from nltk.corpus import wordnet as wn
>>> wn.synsets('motorcar')
[Synset('car.n.01')]
>>> wn.synset('car.n.01').lemma_names
['car', 'auto', 'automobile', 'machine', 'motorcar']

Lemma: car.n.01.motorcar (the first noun sense of the word car; a lemma pairs a synset with a word). A lemma can also be looked up directly, as sketched below.
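
These lookups use the property-style API of the older NLTK release this post is based on (newer releases make them method calls):

>>> wn.lemma('car.n.01.automobile')
Lemma('car.n.01.automobile')
>>> wn.lemma('car.n.01.automobile').synset
Synset('car.n.01')
>>> wn.lemma('car.n.01.automobile').name
'automobile'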

>>> wn.synset('car.n.01').examples
['he needs a car to get to work']
>>> wn.synset('car.n.01').definition
'a motor vehicle with four wheels; usually propelled by an internal combustion engine'


>>> for synset in wn.synsets('car'):
    print synset.lemma_names

['car', 'auto', 'automobile', 'machine', 'motorcar']
['car', 'railcar', 'railway_car', 'railroad_car']
['car', 'gondola']
['car', 'elevator_car']
['cable_car', 'car']

>>> wn.lemmas('car')
[Lemma('car.n.01.car'), Lemma('car.n.02.car'), Lemma('car.n.03.car'), Lemma('car.n.04.car'), Lemma('cable_car.n.01.car')]


>>> wn.synset('big_cat.n.01').definition
'any of several large cats typically able to roar and living in the wild'
>>> wn.synset('big_cat.n.01').examples


>>> motorcar = wn.synset('car.n.01')
>>> types_of_motorcar = motorcar.hyponyms()
>>> sorted([lemma.name for synset in types_of_motorcar for lemma in synset.lemmas])
['Model_T', 'S.U.V.', 'SUV', 'Stanley_Steamer', 'ambulance', 'beach_waggon', 'beach_wagon', 'bus', 'cab', 'compact', 'compact_car', 'convertible', 'coupe', 'cruiser', 'electric', 'electric_automobile', 'electric_car', 'estate_car', 'gas_guzzler', 'hack', 'hardtop', 'hatchback', 'heap', 'horseless_carriage', 'hot-rod', 'hot_rod', 'jalopy', 'jeep', 'landrover', 'limo', 'limousine', 'loaner', 'minicar', 'minivan', 'pace_car', 'patrol_car', 'phaeton', 'police_car', 'police_cruiser', 'prowl_car', 'race_car', 'racer', 'racing_car', 'roadster', 'runabout', 'saloon', 'secondhand_car', 'sedan', 'sport_car', 'sport_utility', 'sport_utility_vehicle', 'sports_car', 'squad_car', 'station_waggon', 'station_wagon', 'stock_car', 'subcompact', 'subcompact_car', 'taxi', 'taxicab', 'tourer', 'touring_car', 'two-seater', 'used-car', 'waggon', 'wagon']


>>> motorcar = wn.synset('car.n.01')
>>> motorcar.hypernyms()
[Synset('motor_vehicle.n.01')]
>>> paths = motorcar.hypernym_paths()
>>> len(paths)
2
>>> paths
[[Synset('entity.n.01'), Synset('physical_entity.n.01'), Synset('object.n.01'), Synset('whole.n.02'), Synset('artifact.n.01'), Synset('instrumentality.n.03'), Synset('container.n.01'), Synset('wheeled_vehicle.n.01'), Synset('self-propelled_vehicle.n.01'), Synset('motor_vehicle.n.01'), Synset('car.n.01')], [Synset('entity.n.01'), Synset('physical_entity.n.01'), Synset('object.n.01'), Synset('whole.n.02'), Synset('artifact.n.01'), Synset('instrumentality.n.03'), Synset('conveyance.n.03'), Synset('vehicle.n.01'), Synset('wheeled_vehicle.n.01'), Synset('self-propelled_vehicle.n.01'), Synset('motor_vehicle.n.01'), Synset('car.n.01')]]


>>> for synset in wn.synsets('mint', wn.NOUN):
    print synset.name + ':', synset.definition

batch.n.02: (often followed by `of') a large number or amount or extent
mint.n.02: any north temperate plant of the genus Mentha with aromatic leaves and small mauve flowers


>>> wn.synset('walk.v.01').entailments()
[Synset('step.v.01')]


>>> wn.lemma('supply.n.02.supply').antonyms()
[Lemma('demand.n.02.demand')]


>>> dir(wn.synset('harmony.n.02'))
['__class__', '__delattr__', '__dict__', ...]


>>> wn.synset('vertebrate.n.01').min_depth()
8
>>> wn.synset('entity.n.01').min_depth()
0


>>> right = wn.synset('right_whale.n.01')
>>> tortoise = wn.synset('tortoise.n.01')
>>> novel = wn.synset('novel.n.01')
>>> right.path_similarity(tortoise)
0.076923076923076927
>>> right.path_similarity(novel)
0.043478260869565216
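
path_similarity is based on the shortest path linking the two senses through the hypernym hierarchy; the shared ancestor can be inspected directly (this mirrors the corresponding NLTK book example, and ties back to the min_depth of 8 for vertebrate.n.01 shown above):

>>> right.lowest_common_hypernyms(tortoise)
[Synset('vertebrate.n.01')]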
