使用的是二分类的MNIST,注意为了区别不同的特征,给它们加上了x0、x1。注意f(x,y)的含义。最后运行时间很长,最大熵模型更适合小数据集。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import math
import random
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.cross_validation import train_test_split
class MaxEnt(object):
def init_params(self, X, Y):
self.X_ = X # X是一个列表,二维数组
self.Y_ = set()
self.cal_Pxy_Px(X, Y)
self.N = len(X) # 训练集大小
self.n = len(self.Pxy) # (x,y)对数
self.M = 10000.0 # 书P91,可认为是学习速率
self.build_dict()
self.cal_EPxy()
def cal_Pxy_Px(self, X, Y):
'''
P82
计算联合分布P(X,Y)的经验分布和边缘分布P(X)的经验分布
'''
# defaultdict作用是当查找时key不存在时,返回默认值而不是报错
self.Pxy = defaultdict(int)
self.Px = defaultdict(int)
for i in range(len(X)):
x_, y = X[i], Y[i]
self.Y_.add(y)
for x in x_: # 遍历每个特征
self.Pxy[(x, y)] += 1
self.Px[x] += 1
def build_dict(self):
'''
为所有(x,y)设定一个id,并建立起双向的映射,相同(x,y)无需重复设定
'''
self.id2xy = {}
self.xy2id = {}
for i, (x, y) in enumerate(self.Pxy):
self.id2xy[i] = (x, y)
self.xy2id[(x, y)] = i
def fxy(self, x, y):
'''
判断是否存在(x,y)这一对
'''
return (x, y) in self.xy2id
def cal_pyx(self, X, y):
'''
计算规范化因子Z的内层(6.23)
同时也是(6.22)的分子
'''
result = 0.0
for x in X: # 这里的X是一条数据,遍历每个特征
if self.fxy(x, y):
id = self.xy2id[(x, y)]
result += self.w[id] # f(x,y) = 1,不用乘
return (math.exp(result), y)
def cal_probability(self, X): # 这里的X是一条数据
'''
计算P85的(6.22)
'''
Pyxs = [(self.cal_pyx(X, y)) for y in self.Y_]
Z = sum([prob for prob, y in Pyxs]) # 规范化因子
return [(prob / Z, y) for prob, y in Pyxs]
def cal_EPx(self):
'''
计算P83最上面的期望
'''
self.EPx = [0.0 for i in range(self.n)] # EPx个数 = (x,y)对数
for i, X in enumerate(self.X_): # 遍历每条数据
Pyxs = self.cal_probability(X) # 得到P(y|X),X固定,遍历y(为什么不是特征x而是X存疑)
for x in X: # 遍历每个特征
for Pyx, y in Pyxs: # Pyx是针对每条数据X和每个标签y的值
if self.fxy(x, y):
id = self.xy2id[(x, y)]
self.EPx[id] += Pyx * (1.0 / self.N) # 暗含经验分布的叠加
def cal_EPxy(self):
'''
计算P82最下面的期望
'''
self.EPxy = defaultdict(float)
for id in range(self.n):
(x, y) = self.id2xy[id]
self.EPxy[id] = float(self.Pxy[(x, y)]) / float(self.N)
def train(self, X, Y):
self.init_params(X, Y)
self.w = [0.0 for i in range(self.n)]
max_iteration = 1000
for times in range(max_iteration):
print('iteration times %d' % times)
deltas = []
self.cal_EPx()
for i in range(self.n):
delta = 1 / self.M * math.log(self.EPx[i])
deltas.append(delta)
self.w = [self.w[i] + deltas[i] for i in range(self.n)]
def predict(self, testset):
results = []
for test in testset:
result = self.cal_probability(test)
results.append(max(result, key=lambda x: x[0])[1])
return results
def rebuild_features(features):
'''
将原feature的(a0,a1,a2,a3...)
变成(0_a0,1_a1,2_a2,3_a3)的形式
因为计算f(x,y)需要区分不同的x
'''
new_features = []
for feature in features:
new_feature = []
for i, f in enumerate(feature): # 遍历每一行数据
new_feature.append(str(i) + '_' + str(f))
new_features.append(new_feature)
return new_features
if __name__ == '__main__':
print('Start reading data:')
time1 = time.time()
raw_data = pd.read_csv('data/train_binary.csv', header=0)
data = raw_data.values
imgs = data[:, 1:]
labels = data[:, 0]
train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33, random_state=11111)
train_features = rebuild_features(train_features)
test_features = rebuild_features(test_features)
print(len(train_features))
print(len(test_features))
# print(train_features[0])
time2 = time.time()
print('read data cost %f seconds' % (time2 - time1))
print('Start training:')
met = MaxEnt()
met.train(train_features, train_labels)
time3 = time.time()
print('training cost %f seconds' % (time3 - time2))
print('Start predicting:')
test_predict = met.predict(test_features)
time4 = time.time()
print('predicting cost %f seconds' % (time4 - time3))
accuracy = sum([test_labels[i] == test_predict[i] for i in range(len(test_labels))]) / len(test_labels)
print('The accuracy is %f!' % accuracy)
'''
output:
Start reading data:
28140
13860
read data cost 42.410567 seconds
Start training:
iteration times 0
iteration times 1
iteration times 2
...(要运行十几个小时,正确率可达97%以上)
'''