Processing Raw Text

A friendly reminder: the posts in this NLTK study notes series are pasted in from local code, and the pasting introduced many indentation errors, so please keep that in mind when reading the code.

This post covers the following topics: extracting features with regular expressions, normalizing words, tokenizing text, segmentation, and more.

Extracting features with regular expressions

Finding all the vowels in a word

    import nltk, re

    word = "supercalifragilisticexpialidocious"
    re.findall(r'[aeiou]', word)    # every vowel in the word

    # frequency distribution of sequences of two or more vowels in the WSJ sample
    wsj = sorted(set(nltk.corpus.treebank.words()))    # all words in the treebank corpus
    fd = nltk.FreqDist(vs for word in wsj
                       for vs in re.findall(r'[aeiou]{2,}', word))    # frequency distribution
    fd.items()    # the sequences and their frequencies
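
To see which sequences dominate, the distribution can also be sorted by count; a small sketch assuming NLTK 3's FreqDist API (which inherits most_common from collections.Counter):

    fd.most_common(12)    # the twelve most frequent vowel sequences, with counts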

Rewriting (compressing) words

    regex = r'^[aeiouAEIOU]+|[aeiouAEIOU]+$|[^aeiouAEIOU]'
    # match vowel sequences at the start or end of a word, plus every consonant
    def compress(word):
        # keep initial and final vowel sequences, drop word-internal vowels
        pieces = re.findall(regex, word)
        return ''.join(pieces)
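
A short usage sketch, following the NLTK book's example of compressing English text (this assumes the udhr corpus has been downloaded):

    english_udhr = nltk.corpus.udhr.words('English-Latin1')
    print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))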

Frequency distribution of consonant-vowel sequences

    rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')    # the toolbox corpus
    cvs = [cv for w in rotokas_words
           for cv in re.findall(r'[ptksvr][aeiou]', w)]    # match consonant-vowel sequences
    cfd = nltk.ConditionalFreqDist(cvs)    # conditional frequency distribution
    cfd.tabulate()    # display the distribution as a table
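
Individual cells can also be queried directly; a small sketch assuming the standard ConditionalFreqDist interface (a dictionary of FreqDist objects):

    cfd['s']['u']    # how often the sequence "su" occurs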

Finding words that contain a given consonant-vowel sequence

    cv_word_pairs = [(cv, w) for w in rotokas_words
                     for cv in re.findall(r'[ptksvr][aeiou]', w)]    # list of (sequence, word) pairs
    cv_index = nltk.Index(cv_word_pairs)
    # cv_index is a dictionary mapping each sequence to the list of words that contain it
    cv_index['su']    # all words containing "su"

Finding word stems

    regex = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'    # common suffixes
    def stem(word):
        # crude stemming: strip one of the listed suffixes, if present
        stem, suffix = re.findall(regex, word)[0]
        return stem
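
A couple of illustrative calls (the example words are mine, not from the original notes); the second one shows how crude the fixed suffix list is:

    stem('processing')    # -> 'process'
    stem('flies')         # -> 'fl', because 'ies' is stripped blindly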

Searching tokenized text

    from nltk.corpus import gutenberg, nps_chat
    moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
    regex1 = r'<a>(<.*>)<man>'    # match the word between "a" and "man"
    moby.findall(regex1)
    chat = nltk.Text(nps_chat.words())
    regex2 = r'<.*><.*><bro>'     # match three-word phrases ending in "bro"
    chat.findall(regex2)
    regex3 = r'<l.*>{3,}'         # match runs of three or more words starting with "l"
    chat.findall(regex3)

Normalizing words

Built-in stemmers

    raw = """DENNIS: Listen, strange women lying in ponds distributing swords is
        no basis for a system of government. Supreme executive power derives
        from a mandate from the masses, not from some farcical aquatic
        ceremony."""
    tokens = nltk.word_tokenize(raw)

    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    [porter.stem(t) for t in tokens]       # with the Porter stemmer
    [lancaster.stem(t) for t in tokens]    # with the Lancaster stemmer, for comparison

    class IndexText(object):
        # wrap a stemmer so that indexing and search match different forms of a word
        def __init__(self, stemmer, text):
            self._stemmer = stemmer
            self._text = text
            self._index = nltk.Index((self._stem(word), i)
                                     for (i, word) in enumerate(text))

        def concordance(self, word, width=40):
            # print each occurrence of the word's stem with left and right context
            key = self._stem(word)
            wc = width // 4    # number of context words on each side
            for i in self._index[key]:
                lcontext = ' '.join(self._text[i-wc:i])
                rcontext = ' '.join(self._text[i:i+wc])
                ldisplay = '%*s' % (width, lcontext[-width:])
                rdisplay = '%-*s' % (width, rcontext[:width])
                print(ldisplay, rdisplay)

        def _stem(self, word):
            return self._stemmer.stem(word).lower()

    porter = nltk.PorterStemmer()
    grail = nltk.corpus.webtext.words('grail.txt')
    text = IndexText(porter, grail)
    text.concordance('lie')

Lemmatization

    wml = nltk.WordNetLemmatizer()    # useful for compiling a text's vocabulary: returns valid lemmas (base forms)
    [wml.lemmatize(t) for t in tokens]
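
Stemming and lemmatization can behave quite differently; a small comparison sketch (the example words are chosen here for illustration):

    porter.stem('lying')      # -> 'lie'
    wml.lemmatize('women')    # -> 'woman'
    wml.lemmatize('lying')    # -> 'lying' (unchanged; the default part of speech is noun)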

Tokenizing text with regular expressions

    text = 'That U.S.A. poster-print costs $12.40...'    # sample sentence from the NLTK book
    pattern = r'''(?x)            # set flag to allow verbose regexps
          (?:[A-Z]\.)+            # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*            # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                  # ellipsis
        | [][.,;"'?():-_`]        # these are separate tokens
        '''
    nltk.regexp_tokenize(text, pattern)    # NLTK's built-in regular-expression tokenizer

Segmentation

Word segmentation

    text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
    seg1 = "0000000000000001000000000010000000000000000100000000000"
    seg2 = "0100100100100001001001000010100100010010000100010010000"
    seg3 = "0000100100000011001000000110000100010000001100010000001"
    # each 0/1 marks whether a word boundary follows the character at that position

    def segment(text, seg):
        # split the text at every position marked with '1'
        words = []
        last = 0
        for i in range(len(seg)):
            if seg[i] == '1':
                words.append(text[last:i+1])
                last = i + 1
        words.append(text[last:])
        return words

    segment(text, seg2)
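
For reference, seg1 only marks the sentence boundaries, so segmenting with it yields four long "words":

    segment(text, seg1)
    # ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']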

    def evaluate(text, seg):
        # simple objective function: number of words plus total size of the lexicon
        # (lower scores indicate better segmentations)
        words = segment(text, seg)
        text_size = len(words)
        lexicon_size = len(' '.join(list(set(words))))
        return text_size + lexicon_size
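
Scoring the three candidate segmentations gives a quick sanity check of the objective (a small sketch; lower is better):

    evaluate(text, seg1)    # sentence-level breaks only
    evaluate(text, seg2)    # the intended word segmentation
    evaluate(text, seg3)    # a segmentation with some merged and split chunks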


    from random import randint

    def flip(segs, pos):
        # flip the 0/1 marker at position pos
        segs = segs[:pos] + str(1 - int(segs[pos])) + segs[pos+1:]
        return segs

    def flip_n(segs, n):
        # flip n randomly chosen positions
        for i in range(n):
            segs = flip(segs, randint(0, len(segs)-1))
        return segs
	
    def anneal(text, segs, iterations, cooling_rate):
        # non-deterministic search for a low-cost segmentation (simulated annealing);
        # cooling_rate plays a role similar to a learning rate
        temperature = float(len(segs))
        while temperature > 0.5:
            best_segs, best = segs, evaluate(text, segs)
            for i in range(iterations):
                guess = flip_n(segs, int(round(temperature)))
                score = evaluate(text, guess)
                if score < best:
                    best, best_segs = score, guess
            score, segs = best, best_segs
            temperature = temperature / cooling_rate
            print(evaluate(text, segs), segment(text, segs))
        return segs
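
A typical invocation, as in the NLTK book's demo (the search is randomized, so the printed segmentations vary from run to run):

    anneal(text, seg1, 5000, 1.2)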

 
