刚开始接触自然语言处理,记下一些NLTK函数:
第一步:导入NLTK自带的示例文本
>>> from nltk.book import *
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
正文:
>>> fdist=FreqDist(text1)
>>> fdist['the']
13721
在text1中the出现的次数
>>> fdist.freq('the')
0.052607363727335814
////////////////源代码///////////////////////////
def freq(self, sample):
    """
    Return the frequency of a given sample.  The frequency of a
    sample is defined as the count of that sample divided by the
    total number of sample outcomes that have been recorded by
    this FreqDist.  The count of a sample is defined as the
    number of times that sample outcome was recorded by this
    FreqDist.  Frequencies are always real numbers in the range
    [0, 1].

    :param sample: the sample whose frequency
        should be returned.
    :type sample: any
    :rtype: float
    """
    # N() sums every recorded count, so evaluate it once and reuse it.
    total = self.N()
    if total == 0:
        # Nothing recorded yet: return a float zero, as documented,
        # instead of dividing by zero.
        return 0.0
    return self[sample] / total
the 的频率
>>> fdist.N()
260819
////////////////源代码///////////////////////////
def N(self):
    """
    Return the total number of sample outcomes that have been
    recorded by this FreqDist.  For the number of unique
    sample values (or bins) with counts greater than zero, use
    ``FreqDist.B()``.

    :rtype: int
    """
    # The total is the sum of the per-sample counts stored as values.
    return sum(self.values())
返回已记录的样本总数(所有样本计数之和);如果要得到计数大于0的不同样本值(bins)的个数,请用下一个函数 B()
>>> fdist.B()
19317
返回不同样本值(bins)的个数,即text1中不同词(token)的种类数
>>> fdist.hapaxes()
[u'funereal', u'unscientific', u'prefix', u'plaudits', u'woody', u'disobeying', u'Westers', u'DRYDEN', u'Untried', u'superficially', u'vesper', u'Western', u'Spurn', u'treasuries',........]
//代码
def hapaxes(self):
    """
    Return a list of all samples that occur once (hapax legomena)

    :rtype: list
    """
    # Iterating over the distribution yields its samples; keep those
    # whose recorded count is exactly one.
    return [item for item in self if self[item] == 1]
验证一下//
//funereal的频率*样本总数
>>> fdist.freq('funereal')*fdist.N()
1.0
hapaxes() 返回只出现一次的样本(hapax legomena)组成的列表;上面的验证结果 1.0 说明 funereal 正好出现了一次
>>> fdist.max()
u','
//代码
def max(self):
    """
    Return the sample with the greatest number of outcomes in this
    frequency distribution.  If two or more samples have the same
    number of outcomes, return one of them; which sample is
    returned is undefined.

    :return: The sample with the maximum number of outcomes in this
        frequency distribution.
    :rtype: any
    :raises ValueError: if this frequency distribution contains no
        samples.
    """
    if len(self) == 0:
        raise ValueError('A FreqDist must have at least one sample before max is defined.')
    # most_common(1) is a one-element list of (sample, count) pairs;
    # return only the sample.
    return self.most_common(1)[0][0]
返回出现次数最多的样本(这里是逗号 u',')
>>> fdist.pformat(maxlen=10)
u"FreqDist({u',': 18713, u'the': 13721, u'.': 6862, u'of': 6536, u'and': 6024, u'a': 4569, u'to': 4542, u';': 4072, u'in': 3916, u'that': 2982, ...})"
//代码(注意:下面贴的其实是 max() 的源码,与上面的重复,并不是 pformat() 的源码)
def max(self):
    """
    Return the sample with the greatest number of outcomes in this
    frequency distribution.  If two or more samples have the same
    number of outcomes, return one of them; which sample is
    returned is undefined.

    :return: The sample with the maximum number of outcomes in this
        frequency distribution.
    :rtype: any
    :raises ValueError: if this frequency distribution contains no
        samples.
    """
    if len(self) == 0:
        raise ValueError('A FreqDist must have at least one sample before max is defined.')
    # most_common(1) is a one-element list of (sample, count) pairs;
    # return only the sample.
    return self.most_common(1)[0][0]
pformat(maxlen) 返回一个字符串,其中按出现次数由高到低列出前 maxlen 个样本及其计数
>>> fdist.keys()
[u'funereal', u'unscientific', u'divinely', u'foul', u'four', u'gag', u'prefix', u'woods'......]
返回所有样本(键)的列表。注意:从上面的输出可以看出,这里的结果并没有按频率递减排序(旧版NLTK才会排序);需要按频率从高到低排列时请用 most_common()
>>> fdist.plot(90,cumulative=True)
绘制频率分布图(样本按频率由高到低排列)
第一个参数——横轴上显示的样本个数
第二个参数——cumulative,是否累加(默认False)
>>> fdist.pprint(30)
FreqDist({u',': 18713, u'the': 13721, u'.': 6862, u'of': 6536, u'and': 6024, u'a': 4569, u'to': 4542, u';': 4072, u'in': 3916, u'that': 2982, u"'": 2684, u'-': 2552, u'his': 2459, u'it': 2209, u'I': 2124, u's': 1739, u'is': 1695, u'he': 1661, u'with': 1659, u'was': 1632, u'as': 1620, u'"': 1478, u'all': 1462, u'for': 1414, u'this': 1280, u'!': 1269, u'at': 1231, u'by': 1137, u'but': 1113, u'not': 1103, ...})
//代码
def pprint(self, maxlen=10, stream=None):
    """
    Print a string representation of this FreqDist to 'stream'

    :param maxlen: The maximum number of items to print
    :type maxlen: int
    :param stream: The stream to print to.  stdout by default
    """
    # pformat() builds the text; print() writes to sys.stdout when
    # stream is None.
    print(self.pformat(maxlen=maxlen), file=stream)
把前 maxlen 个样本的字符串表示(即 pformat 的结果)打印到 stream,默认打印到标准输出
>>> fdist.r_Nr()
defaultdict(<type 'int'>, {68: 9, 1: 9002, 2: 3193, 3: 1721, 4: 968, 5: 695, 6: 497, 7: 384, 8: 318, 9: 253, 10: 196, 11: 190, 12: 152, 13: 112, 14: 100, 15: 107, 16: 98, 17: 71, 18: 69, 19: 58, 20: 50, 21: 35, 22: 37, 23: 43, 24: 38, 25: 38, 26: 34, 27: 25, 28: 32, 29: 26, 30: 27, 31: 20, 32: 21, 33: 16, ......]
//代码
def r_Nr(self, bins=None):
    """
    Return the dictionary mapping r to Nr, the number of samples with
    frequency r.  Every count r that occurs in this distribution gets an
    entry; the entry for r = 0 is always present (see ``bins``).

    :type bins: int
    :param bins: The number of possible sample outcomes.  ``bins``
        is used to calculate Nr(0).  In particular, Nr(0) is
        ``bins-self.B()``.  If ``bins`` is not specified, Nr(0) is
        set to 0.
    :rtype: defaultdict(int)
    """
    _r_Nr = defaultdict(int)
    # Histogram of counts: how many samples were seen exactly `count` times.
    for count in self.values():
        _r_Nr[count] += 1

    # Special case for Nr[0]: samples that were possible but never seen
    # cannot be read off the stored counts, so derive them from `bins`.
    _r_Nr[0] = bins - self.B() if bins is not None else 0

    return _r_Nr
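不是很了解 求教。(按文档字符串的说明:r_Nr() 返回一个从 r 到 Nr 的映射,Nr 是恰好出现 r 次的不同样本的个数。例如上面输出中的 1: 9002 表示有9002个样本只出现过一次,68: 9 表示有9个样本各出现了68次。)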
>>> fdist.tabulate(10,cumulative=True)
, the . of and a to ; in that
18713 32434 39296 45832 51856 56425 60967 65039 68955 71937
>>> fdist.tabulate(10)
, the . of and a to ; in that
18713 13721 6862 6536 6024 4569 4542 4072 3916 2982
//代码
def tabulate(self, *args, **kwargs):
    """
    Tabulate the given samples from the frequency distribution (cumulative),
    displaying the most frequent sample first.  If an integer
    parameter is supplied, stop after this many samples have been
    tabulated.

    Two rows are printed to standard output: the samples, then their
    counts, every cell right-aligned to a common width.

    :param samples: The samples to tabulate (default is all samples)
    :type samples: list
    :param cumulative: A flag to specify whether the freqs are cumulative (default = False)
    :type cumulative: bool
    """
    if len(args) == 0:
        # No limit given: tabulate every sample.
        args = [len(self)]
    samples = [item for item, _ in self.most_common(*args)]

    cumulative = _get_kwarg(kwargs, 'cumulative', False)
    if cumulative:
        freqs = list(self._cumulative_frequencies(samples))
    else:
        freqs = [self[sample] for sample in samples]
    # percents = [f * 100 for f in freqs]  only in ProbDist?

    if not samples:
        # Nothing selected: emit the two (empty) rows rather than letting
        # max() fail on an empty sequence below.
        print()
        print()
        return

    # One column width for the whole table: the widest sample or count.
    width = max(len("%s" % s) for s in samples)
    width = max(width, max(len("%d" % f) for f in freqs))

    for sample in samples:
        print("%*s" % (width, sample), end=' ')
    print()
    for count in freqs:
        print("%*d" % (width, count), end=' ')
    print()
制表
第一个参数—–个数
第二个参数—–是否累加
>>> fdist.most_common()
[(u',', 18713), (u'the', 13721), (u'.', 6862), (u'of', 6536), (u'and', 6024), (u'a', 4569), (u'to', 4542), (u';', 4072), (u'in', 3916), (u'that', 2982), (u"'", 2684), (u'-', 2552), (u'his', 2459),.....]
返回 (样本, 出现次数) 元组的列表,按出现次数从高到低排列;不传参数时返回全部样本
>>> sent=['i','m','aa','bb','cc']
>>> import nltk
>>> nltk.bigrams(sent)
>>> for i in nltk.bigrams(sent):
... print i
...
('i', 'm')
('m', 'aa')
('aa', 'bb')
('bb', 'cc')
>>>
bigrams() 将sent中相邻的两个词依次组成二元组(bigram)