关于最大熵模型的介绍请看:http://www.cnblogs.com/hexinuaa/p/3353479.html
下面是GIS训练算法的python实现,代码不到100行。
from collections import defaultdict
import math
class MaxEnt(object):
def __init__(self):
self.feats = defaultdict(int)
self.trainset = []
self.labels = set()
def load_data(self, file):
    """Read whitespace-separated training records from the path ``file``.

    Each line is split on whitespace; the first column is the label and
    the remaining columns are features. Lines with fewer than two columns
    are skipped. For every kept line the label is added to
    ``self.labels``, each distinct ``(label, feature)`` pair has its
    count in ``self.feats`` incremented by one, and the full list of
    columns is appended to ``self.trainset``.

    Args:
        file: path passed to ``open``.
    """
    # `with` guarantees the handle is closed even if parsing raises.
    with open(file) as handle:
        for line in handle:
            fields = line.strip().split()
            # at least two columns: a label plus one or more features
            if len(fields) < 2:
                continue
            # the first column is label
            label = fields[0]
            self.labels.add(label)
            # set() so a feature repeated on one line is counted once
            for f in set(fields[1:]):
                # (label,f) tuple is feature
                self.feats[(label, f)] += 1
            # the stored record keeps the label and any repeated features
            self.trainset.append(fields)
def _initparams(self):
self.size = len(self.trainset)
# M param for GIS training algorithm
self.M = max([len(record)-1 for record in self.trainset])
self.ep_ = [0.0]*len(self.feats)
for i,f in enumerate(self.feats):
# calculate feature expectation on empirical distribution
self.ep_[i] = float(self.feats[f])/float(self.size)
# each feature function correspond to id
self.feats[f] = i
# init weight for each feature
self.w = [0.0]*len(self.feats)
self.lastw = self.w
def probwgt(self, features, label):
    """Return ``exp`` of the summed weights of the features active for ``label``.

    For each entry ``f`` of ``features``, the pair ``(label, f)`` is looked
    up in ``self.feats``; when present, its value is used as an index into
    ``self.w`` and that weight is added to the running total. Pairs not in
    ``self.feats`` contribute nothing. The result is not normalised.

    Args:
        features: iterable of feature values.
        label: the label to pair each feature with.

    Returns:
        ``math.exp`` of the accumulated weight (``1.0`` when nothing matches).
    """
    score = 0.0
    for feat in features:
        idx = self.feats.get((label, feat))
        if idx is None:
            # this (label, feature) pair is not a known feature
            continue
        score += self.w[idx]
    return math.exp(score)
"""
calculate feature expectation on model distribution
"""
def Ep(self):
ep = [0.0]*len(self.feats)
for record in self.trainset:
features = record[1:]
# calculate p(y|x)