Friendly reminder: the posts in this NLTK study notes series are pasted from local code, and a lot of indentation was mangled in the process; please bear that in mind.
This part covers: extracting features with regular expressions, normalizing word forms, tokenizing text, segmentation, and more.
Extracting features with regular expressions
Find all the vowels in a word
import nltk, re
word = "supercalifragilisticexpialidocious"
re.findall(r'[aeiou]', word)
# build a frequency distribution over sequences of two or more vowels in the Treebank corpus
wsj = sorted(set(nltk.corpus.treebank.words()))  # all the words in the Treebank corpus
fd = nltk.FreqDist(vs for word in wsj
                   for vs in re.findall(r'[aeiou]{2,}', word))  # the frequency distribution
# the sequences and their frequencies
fd.items()
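To look at just the most frequent sequences, FreqDist also offers most_common; a minimal sketch (the cutoff of 12 is arbitrary):
fd.most_common(12)  # the 12 most frequent vowel sequences with their counts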
Rewriting words
regex = r'^[aeiouAEIOU]+|[aeiouAEIOU]+$|[^aeiouAEIOU]'
# match vowel sequences at the start or end of a word, plus every consonant
def compress(word):
    pieces = re.findall(regex, word)
    return ''.join(pieces)
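Applied to running text this keeps words recognizable while dropping the word-internal vowels; a quick sketch using the UDHR corpus (any English text would do):
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))  # first 75 words, compressed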
Frequency distribution of consonant-vowel sequences
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')  # the Rotokas lexicon from the Toolbox corpus
cvs = [cv for w in rotokas_words
       for cv in re.findall(r'[ptksvr][aeiou]', w)]
# match consonant-vowel sequences
cfd = nltk.ConditionalFreqDist(cvs)
# conditional frequency distribution: consonant as condition, vowel as sample
cfd.tabulate()
# display the distribution as a table
Finding the words that contain a given consonant-vowel sequence
cv_word_pairs = [(cv, w) for w in rotokas_words
                 for cv in re.findall(r'[ptksvr][aeiou]', w)]
# list of (sequence, word) pairs
cv_index = nltk.Index(cv_word_pairs)
# cv_index is a dictionary keyed by cv sequence, with a list of words as each value
cv_index['su']
# all the words containing 'su'
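nltk.Index is essentially a defaultdict(list) built from pairs; a minimal sketch of the same idea in plain Python:
from collections import defaultdict
cv_index_manual = defaultdict(list)
for cv, w in cv_word_pairs:
    cv_index_manual[cv].append(w)  # same cv -> words mapping that nltk.Index builds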
Finding word stems
regex = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
# common suffixes
def stem(word):
    # strip off the suffix, if any, and return the stem
    stem, suffix = re.findall(regex, word)[0]
    return stem
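A quick check on a few words shows both the wins and the rough edges of this heuristic (the word list here is just for illustration):
[stem(w) for w in ['processing', 'government', 'ponds', 'women', 'basis']]
# 'processing' -> 'process' and 'ponds' -> 'pond', but 'basis' -> 'basi' and 'women' is left alone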
Searching tokenized text
from nltk.corpus import gutenberg, nps_chat
regex1 = r'<a>(<.*>)<man>'  # match the word between 'a' and 'man', i.e. the adjective in "a ... man"
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(regex1)
regex2 = r'<.*><.*><bro>'  # match three-word phrases ending in 'bro'
regex3 = r'<l.*>{3,}'  # match sequences of three or more words starting with 'l'
chat = nltk.Text(nps_chat.words())
chat.findall(regex2)
chat.findall(regex3)
Normalizing word forms
Built-in stemmers
raw = """DENNIS: Listen, strange women lying in ponds distributing swords is
no basis for a system of government. Supreme executive power derives
from a mandate from the masses, not from some farcical aquatic
ceremony."""
tokens = nltk.word_tokenize(raw)
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()  # stemming lets indexing and search cover different forms of a word
[porter.stem(t) for t in tokens]  # stem with the Porter stemmer
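The two stemmers follow different rule sets, so it is worth comparing their output on the same tokens:
[lancaster.stem(t) for t in tokens]  # Lancaster is the more aggressive of the two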
class IndexText(object):
    # wrap a stemmer so that a concordance search matches every word form sharing a stem
    def __init__(self, stemmer, text):
        self._stemmer = stemmer
        self._text = text
        self._index = nltk.Index((self._stem(word), i)
                                 for (i, word) in enumerate(text))

    def concordance(self, word, width=40):
        # print every occurrence of the word's stem together with its context
        key = self._stem(word)
        wc = width // 4  # words of context on each side (integer, for slicing)
        for i in self._index[key]:
            lcontext = ' '.join(self._text[i-wc:i])
            rcontext = ' '.join(self._text[i:i+wc])
            ldisplay = '%*s' % (width, lcontext[-width:])
            rdisplay = '%-*s' % (width, rcontext[:width])
            print(ldisplay, rdisplay)

    def _stem(self, word):
        return self._stemmer.stem(word).lower()
porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')
text = IndexText(porter, grail)
text.concordance('lie')
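Because the index is keyed by stem, a search for any form of a word finds all of them; assuming the Porter stemmer maps these forms to the same stem:
text.concordance('lying')  # should hit the same positions as 'lie' and 'lies'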
Lemmatization
wml = nltk.WordNetLemmatizer()  # lemmatization maps each token to its dictionary form, useful when compiling a text's vocabulary
[wml.lemmatize(t) for t in tokens]
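lemmatize treats its input as a noun by default; passing a part of speech changes the result:
wml.lemmatize('lying', pos='v')  # 'lie' -- analyzed as a verb
wml.lemmatize('women')  # 'woman' -- noun is the default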
Tokenizing text with regular expressions
text = ""
pattern = r'''(?x) #set flag to allow verbose regexps
([A-Z]\.)+ #abbreviations, e.g. U.S.A.
| \w+(-\w+)* #words with optional internal hyphens
| \$?\d+(\.\d+)?%? #currency and percentages, e.g. $12.40, 82%
| \.\.\. ellipsis
|[][.,;"'?():-_`] #these are separate tokens
'''
nltk.regexp_tokenize(text, pattern)
# NLTK's built-in regular-expression tokenizer
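With the sample text above, the call should return something like:
# ['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']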
Segmentation
Word segmentation
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
seg3 = "0000100100000011001000000110000100010000001100010000001"
# a string of 0s and 1s marks, for each character, whether a break follows that position
def segment(text, seg):
    # split the text after every position marked '1'
    words = []
    last = 0
    for i in range(len(seg)):
        if seg[i] == '1':
            words.append(text[last:i+1])
            last = i + 1
    words.append(text[last:])
    return words
segment(text, seg2)
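seg1 marks only the sentence-level breaks while seg2 marks every word boundary, so the two granularities can be compared directly:
segment(text, seg1)  # ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']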
def evaluate(text, seg):
    # a simple objective: number of words in the text plus the total size of the lexicon (lower is better)
    words = segment(text, seg)
    text_size = len(words)
    lexicon_size = len(' '.join(list(set(words))))
    return text_size + lexicon_size
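Scoring the three candidate segmentations shows the objective is only a proxy: in the NLTK book's version of this example, the over-segmented seg3 actually scores slightly better than the true segmentation seg2:
evaluate(text, seg1)
evaluate(text, seg2)
evaluate(text, seg3)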
def flip(segs, pos):
    # flip the 0/1 marker at position pos
    segs = segs[:pos] + str(1-int(segs[pos])) + segs[pos+1:]
    return segs
from random import randint

def flip_n(segs, n):
    # flip n randomly chosen positions
    for i in range(n):
        segs = flip(segs, randint(0, len(segs)-1))
    return segs
def anneal(text, segs, iterations, cooling_rate):
    # search for a good segmentation by simulated annealing
    # cooling_rate plays a role similar to a learning rate: it controls how quickly the search settles down
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            guess = flip_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print(evaluate(text, segs), segment(text, segs))
    return segs
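A typical run from the NLTK book starts from the coarse segmentation and anneals toward a cheaper one:
anneal(text, seg1, 5000, 1.2)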