文档分类,其实就是根据文档的特征,统计每一个文档中的特征集,从而进行分类
这些特征的选择,可以选择高频词,词的后缀,也可以根据上下文语境,可以结合这个词和这个词的上一个词进行特征提取,还可以使用连续分类器,既考虑已知的标注集,又根据该集合预测新的标注,并加入到历史标注集中,有种半监督的意味。
有监督分类,用到了带正确标注的训练集
"""
过拟合问题。
完善特征集办法:错误分析;建立开发集,将其分为训练集(用于训练模型)和开发测试集(用于错误分析),还有一个测试集
"""
目录
1)高频词作为文档特征集
# Build (document-words, category) pairs from the movie_reviews corpus,
# shuffle them, and keep the 2000 most frequent words as candidate features.
import random

import nltk
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)  # remove the pos/neg ordering before train/test split

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# FreqDist.keys() is a non-sliceable view (and not frequency-ordered) in
# NLTK 3; most_common() yields (word, count) pairs by decreasing frequency.
word_features = [w for w, _count in all_words.most_common(2000)]  # high-frequency words over all documents
def document_features(document):
    """Feature extractor: for each high-frequency word in the global
    ``word_features``, record whether it occurs in *document*."""
    vocabulary = set(document)  # O(1) membership tests instead of scanning the list
    return {'contains(%s)' % feature_word: (feature_word in vocabulary)
            for feature_word in word_features}
# Train a Naive Bayes classifier on the per-document feature sets and
# report accuracy on a 100-document held-out slice.
featuresets = [(document_features(d), c) for (d, c) in documents]  # d: document words, c: category
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)  # train on the feature sets
print(nltk.classify.accuracy(classifier, test_set))  # print() call: valid in Python 2 and 3
classifier.show_most_informative_features(5)  # five most informative features
2)找出最明显的后缀
# Find the most common word suffixes: count the frequency of 1-, 2- and
# 3-character suffixes over the Brown corpus and keep the top 100 as
# candidate part-of-speech features.
from nltk.corpus import brown

suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
# FreqDist.keys() cannot be sliced in NLTK 3; most_common() returns
# (suffix, count) pairs sorted by decreasing frequency.
common_suffixes = [suffix for suffix, _count in suffix_fdist.most_common(100)]
# Feature extractor built from the common suffixes: check the given word
# against each of them.
def pos_features(word):
    """Return a truth-table feature set: for every suffix in the global
    ``common_suffixes``, whether the lower-cased *word* ends with it."""
    lowered = word.lower()
    return {'endswith(%s)' % suffix: lowered.endswith(suffix)
            for suffix in common_suffixes}
3)结合上下文语境进行分类
# Exploit context: combine the current word's suffixes with the previous
# word to predict the current word's part of speech.
def pos_features(sentence, i):
    """Extract POS features for token *i* of *sentence* (a list of words):
    its last one/two/three characters plus the previous word."""
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}  # was [-2:] — duplicated suffix(2)
    if i == 0:
        # No left context at the start of the sentence.
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i - 1]
    return features
# Build (features, gold tag) training pairs from the tagged Brown news
# sentences and evaluate a Naive Bayes POS classifier on a 10% held-out slice.
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []  # was built as `features` but consumed as `featuresets` below — NameError
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)  # strip the gold tags
    for i, (word, tag) in enumerate(tagged_sent):
        # Features come from the untagged sentence; the label is the gold tag.
        featuresets.append((pos_features(untagged_sent, i), tag))
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
4)使用连续分类器进行词性标注
def pos_features(sentence, i, history):
    """Feature extractor for the consecutive (greedy sequence) tagger.

    sentence: list of word strings; i: token index;
    history: tags already predicted for tokens 0..i-1 of this sentence.
    """
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}  # was [-2:] — duplicated suffix(2)
    if i == 0:
        # No left context at the start of the sentence.
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"  # key was misspelled "prevz-tag"
    else:
        features["prev-word"] = sentence[i - 1]
        features["prev-tag"] = history[i - 1]  # key was misspelled "prevz-tag"
    return features
class ConsecutivePosTagger(nltk.TaggerI):  # subclasses nltk.TaggerI
    """Greedy left-to-right POS tagger: each prediction is conditioned on
    the tags already assigned earlier in the same sentence (the history)."""

    def __init__(self, train_sets):
        """Build training pairs from tagged sentences and train the
        underlying Naive Bayes classifier."""
        train_set = []  # was never initialized before being appended to
        for tagged_sent in train_sets:
            untagged_sent = nltk.tag.untag(tagged_sent)  # strip the gold tags
            history = []  # tags seen so far for this sentence (gold during training)
            for i, (word, tag) in enumerate(tagged_sent):
                features = pos_features(untagged_sent, i, history)
                train_set.append((features, tag))  # was undefined name `featureset`
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """Tag *sentence* (a list of words), feeding each predicted tag
        back into the history used for the following tokens."""
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return list(zip(sentence, history))  # list(...) so Python 3 callers get a sequence, as in Python 2
# Train the consecutive tagger on Brown news and tag a new sentence.
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)  # was len(tagged_sent): a stale loop variable
train_set, test_set = tagged_sents[size:], tagged_sents[:size]
classifier = ConsecutivePosTagger(train_set)
# print(classifier.evaluate(test_set))
# The trained consecutive classifier's tag() method labels new sentences:
sen = "I love and trust you"
sen_word = nltk.word_tokenize(sen)
classifier.tag(sen_word)  # was `tagger.tag(...)`, but no `tagger` was ever defined
# [('I', u'PPSS'), ('love', u'HV'), ('and', u'CC'), ('trust', u'AP'), ('you', u'PPSS')]
5)句子分割器
通过训练标点符号分类器,可以得到一个对句子进行分割的断句器
import nltk

# Sentence segmentation: collect every token of the raw Treebank text and
# record the token offsets at which sentence boundaries occur.
# (The original also had `sents=nltk.corpus.treebank_raw.sents(0` — an
# unclosed paren whose result was never used; the dead line is removed.)
tokens = []
boundaries = set()
offset = 0
for sent in nltk.corpus.treebank_raw.sents():
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset - 1)  # index of the last token of each sentence
def punct_features(tokens, i):
    """Feature extractor: describe the context of the candidate
    punctuation token ``tokens[i]``.

    Callers guarantee 0 < i < len(tokens) - 1, so both neighbours exist.
    """
    previous_token = tokens[i - 1]
    return {
        'next-word-capitalized': tokens[i + 1][0].isupper(),
        'prevword': previous_token.lower(),
        'punct': tokens[i],
        'prev-words-is-one-char': len(previous_token) == 1,
    }
# Build the labelled set: each '.?!' token (excluding the first and last
# positions) is labelled True iff its offset is a recorded sentence boundary.
featuresets = [
    (punct_features(tokens, i), i in boundaries)
    for i in range(1, len(tokens) - 1)
    if tokens[i] in '.?!'
]
# Example item:
# ({'next-word-capitalized': True, 'punct': u'.', 'prev-words-is-one-char': False, 'prevword': u'29'}, True)
# Train a punctuation classifier on 90% of the data and measure its
# accuracy on the remaining 10%.
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
# Sentence segmenter built on the trained punctuation classifier.
def segment_sentences(words):
    """Split the token list *words* into sentences, asking the global
    punctuation ``classifier`` which '.?!' tokens end a sentence.

    Returns a list of token lists, one per sentence.
    """
    start = 0
    sents = []
    for i, word in enumerate(words):  # was `for i, word in words` — words are strings, not pairs
        # The classifier takes a feature dict, not (words, i).
        if word in '.?!' and classifier.classify(punct_features(words, i)):
            sents.append(words[start:i + 1])
            start = i + 1
    if start < len(words):
        sents.append(words[start:])  # trailing fragment with no final punctuation
    return sents  # was missing — the function implicitly returned None