Answer the following questions:
(1) How can we identify features of language data that are clearly useful for classification?
(2) How can we build language models that perform language processing tasks automatically?
(3) What can we learn about language from these models?
Decision trees, naive Bayes classifiers, and maximum entropy classifiers
1 Supervised Classification
# Gender Identification
Building a classifier
def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('Shrek')
{'last_letter': 'k'}
from nltk.corpus import names
import random
names = ([(name, 'male') for name in names.words('male.txt')] +
[(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)
import nltk
featuresets = [ (gender_features(n), g) for (n,g) in names ]
train_set, test_set = featuresets[500:], featuresets[:500]  # training and test sets
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.classify(gender_features('Neo'))
'male'
classifier.classify(gender_features('Trinity'))
'female'
print nltk.classify.accuracy(classifier, test_set)  # evaluation
0.75
classifier.show_most_informative_features(5)  # which features are most effective for distinguishing the gender of names
Most Informative Features
last_letter = u'a' female : male = 33.4 : 1.0
last_letter = u'k' male : female = 30.8 : 1.0
last_letter = u'f' male : female = 17.3 : 1.0
last_letter = u'p' male : female = 10.5 : 1.0
last_letter = u'd' male : female = 10.0 : 1.0
# Choosing the Right Features
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features
gender_features2('John')
featuresets = [(gender_features2(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)  # train a naive Bayes classifier
print nltk.classify.accuracy(classifier, test_set)
0.776
# One effective method for refining the feature set is error analysis. First, select a development set containing the corpus data used to create the model, then split this development set into a training set and a dev-test set.
train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n),g) for (n,g) in devtest_names]
test_set = [(gender_features(n),g) for (n,g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, devtest_set)
0.766
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )
for (tag, guess, name) in sorted(errors):
    print 'correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name)
correct=female guess=male name=Abagael
correct=female guess=male name=Adel
correct=female guess=male name=Alys
correct=female guess=male name=Amargo
correct=female guess=male name=Ambur
...
# Adjust the feature extractor to include features for two-letter suffixes
def gender_features(word):
    return {'suffix1': word[-1:],
            'suffix2': word[-2:]}
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n),g) for (n,g) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, devtest_set)
0.784
# Document Classification
Classify movie reviews from the movie review corpus as positive or negative.
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
# documents is a list of (text, category) pairs
# A feature extractor for document classification, whose features indicate whether each word is present in a given document
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000]
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
print document_features(movie_reviews.words('pos/cv957_8737.txt'))
{u'contains(corporate)': False, u'contains(barred)': False, u'contains(batmans)': False, u'contains(menacing)': False,
u'contains(rags)': False, u'contains(inquires)': False,
# Train and test a classifier for document classification
featuresets = [(document_features(d),c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)
0.73
classifier.show_most_informative_features(5)  # find out which features the classifier found most informative
Most Informative Features
contains(sans) = True neg : pos = 9.1 : 1.0
contains(mediocrity) = True neg : pos = 7.8 : 1.0
contains(dismissed) = True pos : neg = 6.9 : 1.0
contains(testament) = True pos : neg = 6.5 : 1.0
contains(bruckheimer) = True neg : pos = 6.4 : 1.0
# Part-of-Speech Tagging
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
from operator import itemgetter
common_suffixes = sorted(suffix_fdist.items(), key=itemgetter(1), reverse=True)
common_suffixes[:100]
[(u'e', 202946),
 (u',', 175002),
 (u'.', 152999),
 (u's', 128722),
 (u'd', 105687),
 (u't', 94459),
 ...]
common_suf = [ suffix[0] for suffix in common_suffixes][:100]
common_suf
Define a feature extractor that checks a given word's suffixes, then train a new decision tree classifier.
def pos_features(word):
    features = {}
    for suffix in common_suf:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features
tagged_words = brown.tagged_words(categories='news')
tagged_words[0]
(u'The', u'AT')
len(tagged_words)
100554
len(pos_features(tagged_words[0][0]))
100
pos_features(tagged_words[0][0])
{u"endswith('')": False, u"endswith(')": False, u"endswith('s)": False, u'endswith(()': False, u'endswith())': False, u'endswith(,)': False, ...}
featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
size = int(len(featuresets) * 0.1)
size
10055
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.DecisionTreeClassifier.train(train_set)  # decision tree
nltk.classify.accuracy(classifier, test_set)
0.6270512182993535
classifier.classify(pos_features('cats'))
u'NNS'
# An advantage of decision trees is that they are easy to interpret; we can even print them out as pseudocode.
print classifier.pseudocode(depth=4)
if endswith(the) == False:
  if endswith(,) == False:
    if endswith(s) == False:
      if endswith(.) == False: return u'.'
      if endswith(.) == True: return u'.'
    if endswith(s) == True:
      if endswith(is) == False: return u'PP$'
      if endswith(is) == True: return u'BEZ'
  if endswith(,) == True: return u','
if endswith(the) == True: return u'AT'
# Exploring Context
Instead of passing in only the word being tagged, pass in the entire (untagged) sentence along with the index of the target word.
# Feature detector
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features
brown.sents()[0][7]
u'an'
brown.sents()[0][8]
u'investigation'
pos_features(brown.sents()[0], 8)  # four features
{'prev-word': u'an', 'suffix(1)': u'n', 'suffix(2)': u'on', 'suffix(3)': u'ion'}
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append( (pos_features(untagged_sent, i), tag) )
size = int(len(featuresets) * 0.1)
size
10055
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
0.7891596220785678
# Sequence Classification
In the part-of-speech tagging example, various sequence classifier models can be used to jointly choose part-of-speech tags for all the words in a given sentence.
One sequence classification strategy, known as consecutive classification or greedy sequence classification, finds the most likely class label for the first input, then uses that answer to help find the best label for the next input. The process is repeated until all inputs have been labeled.
Feature extractor
def pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
    return features
class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print tagger.evaluate(test_sents)
# Other Sequence Classification Methods
One shortcoming of this approach is that once a decision is made, it cannot be changed. For example, if we decide to label a word as a noun but later find evidence that it should have been a verb, there is no way to go back and fix the mistake. One solution is to adopt a transformational strategy: transformational joint classification creates an initial assignment of labels for the inputs and then iteratively refines that assignment, attempting to repair inconsistencies between related inputs.
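NLTK's Brill tagger implements this transformational idea. The lines below are a minimal sketch, assuming NLTK 3's nltk.tag.brill module is available and reusing the train_sents/test_sents split from the tagging example above.
# Sketch only: start from a simple unigram baseline, then learn rules that repair its errors.
from nltk.tag import UnigramTagger
from nltk.tag.brill import fntbl37                     # a standard set of rule templates
from nltk.tag.brill_trainer import BrillTaggerTrainer
baseline = UnigramTagger(train_sents)                  # initial label assignment
trainer = BrillTaggerTrainer(baseline, fntbl37(), trace=0)
brill_tagger = trainer.train(train_sents, max_rules=20)   # learn up to 20 fix-up rules
print brill_tagger.evaluate(test_sents)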
Another solution is to assign scores to all possible sequences of part-of-speech tags and choose the sequence with the highest overall score. This is the approach taken by hidden Markov models. Like consecutive classifiers, hidden Markov models consider not only the input but also the history of predicted tags. However, rather than simply finding the single best tag for a given word, they produce a probability distribution over tags. These probabilities are then combined to compute probability scores for tag sequences, and the highest-probability tag sequence is chosen. Unfortunately, the number of possible tag sequences is quite large: given a tag set with 30 tags, there are about 600 trillion (30^10) ways to label a 10-word sentence. To avoid considering each of these sequences separately, hidden Markov models require the feature extractor to look only at the most recent tag (or the most recent n tags, where n is fairly small). Given that restriction, dynamic programming can be used to efficiently find the most likely tag sequence: in particular, for each consecutive word index i, a score is computed for every possible pair of current and previous tags. The same basic approach is taken by two more advanced models, maximum entropy Markov models and linear-chain conditional random fields, which use different algorithms to score tag sequences.
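NLTK includes a supervised HMM trainer that realizes this idea. The lines below are a minimal sketch, assuming the Brown train_sents split built above; the default estimator is used, so smoothing of unseen words is ignored.
# Sketch only: estimate transition and emission counts from tagged sentences,
# then decode with dynamic programming (Viterbi) to pick the best tag sequence.
import nltk
hmm_trainer = nltk.tag.hmm.HiddenMarkovModelTrainer()
hmm_tagger = hmm_trainer.train_supervised(train_sents)
print hmm_tagger.tag('The men said that the election was over .'.split())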
2 Further Examples of Supervised Classification
# Sentence Segmentation
The first step is to obtain some data that has already been segmented into sentences and convert it into a form suitable for extracting features.
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset - 1)
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prevword': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}
featuresets = [(punct_features(tokens, i), (i in boundaries))
               for i in range(1, len(tokens)-1)
               if tokens[i] in '.?!']
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
0.936026936026936
def segment_sentences(words):  # classification-based sentence segmenter
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])
    return sents
# Identifying Dialogue Act Types
Statements that perform actions, greetings, questions, answers, assertions, and clarifications can all be thought of as types of speech-based dialogue acts. Recognizing the dialogue acts underlying the utterances in a conversation is an important step toward understanding it.
Using the NPS Chat corpus, build a classifier that identifies the dialogue act types of new instant-messaging posts.
posts = nltk.corpus.nps_chat.xml_posts()[:10000]  # the XML annotation of each post
def dialogue_act_features(post):  # feature extractor
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains(%s)' % word.lower()] = True
    return features
featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]
({'contains(gay)': True, 'contains(im)': True, 'contains(left)': True, 'contains(name)': True, 'contains(now)': True, 'contains(this)': True, 'contains(with)': True}, 'Statement')  # a statement
# Train and evaluate the classifier
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)
0.668
# Recognizing Textual Entailment
Recognizing textual entailment (RTE) is the task of determining whether a given fragment of text T entails another text, called the "hypothesis". To date there have been four RTE Challenges, in which shared development and test data are made available to competing teams.
def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)
print extractor.text_words
set(['Organisation', 'Shanghai', 'Asia', 'four', 'at', 'operation', 'SCO', 'Iran', 'Soviet', 'Davudi', 'fight', 'China', 'association', 'fledgling', 'was', 'that', 'republics', 'former', 'Co', 'representing', 'Russia', 'Parviz', 'central', 'meeting', 'together', 'binds', 'terrorism.'])
print extractor.hyp_words
set(['member', 'SCO.', 'China'])
print extractor.overlap('word')
set([])
print extractor.overlap('ne')
set(['China'])
print extractor.hyp_extra('word')
set(['member'])
# Scaling Up to Larger Datasets
Pure-Python classification is not very fast; for larger datasets it is recommended to explore NLTK's interfaces to external machine learning packages.
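For example, NLTK provides a bridge to scikit-learn. The lines below are a minimal sketch, assuming scikit-learn is installed and reusing the train_set/test_set feature pairs built earlier.
# Sketch only: any scikit-learn estimator can be wrapped as an NLTK classifier.
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
sk_classifier = SklearnClassifier(LogisticRegression())
sk_classifier.train(train_set)                      # same (featureset, label) pairs as before
print nltk.classify.accuracy(sk_classifier, test_set)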
3 Evaluation
The test set
Accuracy
Precision and recall (see the sketch below)
Confusion matrices
Cross-validation
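A hedged sketch of the precision, recall, and confusion-matrix measures listed above, reusing the classifier and devtest_set from the name-gender example; these variable names are assumptions carried over from that section.
# Sketch only: precision/recall compare sets of item indices per label.
import collections
import nltk
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(devtest_set):
    refsets[label].add(i)
    testsets[classifier.classify(feats)].add(i)
print 'male precision:', nltk.metrics.precision(refsets['male'], testsets['male'])
print 'male recall:   ', nltk.metrics.recall(refsets['male'], testsets['male'])
# A confusion matrix compares gold labels against predicted labels.
gold = [label for (feats, label) in devtest_set]
pred = [classifier.classify(feats) for (feats, label) in devtest_set]
print nltk.ConfusionMatrix(gold, pred)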
4 Decision Trees
Entropy and information gain
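Entropy can be computed directly from a list of labels; below is a small helper in the spirit of the NLTK book. Information gain is then the drop in entropy obtained by splitting the data on a feature.
import math
import nltk
def entropy(labels):
    # H = -sum over labels l of P(l) * log2 P(l), using the empirical distribution
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum(p * math.log(p, 2) for p in probs)
print entropy(['male', 'male', 'male', 'male'])       # 0.0  (no uncertainty)
print entropy(['male', 'female', 'male', 'female'])   # 1.0  (maximally mixed)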
5 Naive Bayes Classifiers
The underlying probabilistic model
Zero counts and smoothing (see the sketch below)
Non-binary features
The naivety of independence
The cause of double-counting
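For the zero-counts-and-smoothing point above, NLTK's naive Bayes trainer accepts an estimator argument; a hedged sketch, reusing a (featureset, label) training set such as the name-gender train_set built earlier.
# Sketch only: swap the default expected-likelihood estimator for Laplace (add-one)
# smoothing, so a feature never seen with a label does not get zero probability.
from nltk.probability import LaplaceProbDist
smoothed_classifier = nltk.NaiveBayesClassifier.train(train_set, estimator=LaplaceProbDist)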
6 Maximum Entropy Classifiers
The maximum entropy model
Maximizing entropy
Generative classifiers versus conditional classifiers
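A minimal sketch of training NLTK's maximum entropy (conditional) classifier on the same feature sets; the IIS algorithm and a small iteration cap are assumptions made only to keep training quick.
# Sketch only: maxent training is iterative and much slower than naive Bayes.
maxent_classifier = nltk.MaxentClassifier.train(train_set, algorithm='iis',
                                                trace=0, max_iter=5)
print nltk.classify.accuracy(maxent_classifier, test_set)
maxent_classifier.show_most_informative_features(5)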
7 Modeling Linguistic Patterns
What do models tell us?
8 Further Reading
Using Weka, Mallet, TADM, and MegaM