为了加深对机器学习算法的理解,并熟悉 Python、pandas 和 scikit-learn,现在自己动手实现一下主要的机器学习算法,程序记录如下:
决策树类的实现程序:
import numpy as np
import pandas as pd
import random as rd
import re
from sklearn import tree
from sklearn import preprocessing
from sklearn import cross_validation
from math import log
class dec_tree(object):
    """Small wrapper that trains and queries a dict-based decision tree.

    The heavy lifting is delegated to the module-level helpers
    ``load_data_set``, ``build_tree`` and ``tree_test``.
    """

    def __init__(self, name):
        """Remember the training file; nothing is read yet.

        Args:
            name: Path of the file later handed to ``load_data_set``.
        """
        self.train_file = name
        # Tree returned by build_tree(); stays empty until train() runs.
        self.model = {}
        # Feature names in column order; filled in by train().
        self.feature_name = []

    def train(self):
        """Load the training data and store the resulting tree in ``model``."""
        data_set, fea_name = load_data_set(self.train_file)
        # Keep our own copy and hand build_tree a separate one, so the names
        # used for prediction survive whatever build_tree does to its list.
        self.feature_name = list(fea_name)
        self.model = build_tree(data_set, list(fea_name))

    def test(self, x):
        """Classify a single sample.

        Args:
            x: Feature values of the sample, ordered like the feature names.

        Returns:
            Whatever ``tree_test`` returns for this sample.
        """
        if not self.feature_name:
            # No names cached (train() was not called): fall back to reading
            # them from the training file, as this method always used to do.
            _, fea_name = load_data_set(self.train_file)
            self.feature_name = fea_name
        return tree_test(self.model, x, self.feature_name)
def gen_entro(data_set):
    """Return the Shannon entropy (base 2) of the labels in *data_set*.

    Args:
        data_set: Sequence of rows; the last element of each row is taken
            as its class label.

    Returns:
        The entropy in bits as a float. An empty data set yields ``0.0``.
    """
    set_len = len(data_set)
    label_counts = {}
    for row in data_set:
        label = row[-1]
        label_counts[label] = label_counts.get(label, 0) + 1
    h = 0.0
    for count in label_counts.values():
        # Convert before dividing: on Python 2, int / int truncates, which
        # would turn every probability into 0 or 1.
        prob = float(count) / set_len
        h -= prob * log(prob, 2)
    return h
def cut_data_set(data_set, fea_idx, value):
    """Select the rows whose feature equals *value* and drop that feature.

    Args:
        data_set: Iterable of rows (lists or numpy rows).
        fea_idx: Column index of the feature to filter on and remove.
        value: Feature value a row must have to be kept.

    Returns:
        A list of plain lists: every matching row without column *fea_idx*.
    """
    new_data_set = []
    for data in data_set:
        if data[fea_idx] == value:
            # Copy into a plain list first: `+` on numpy rows adds
            # element-wise instead of concatenating the two halves.
            row = list(data)
            new_data_set.append(row[:fea_idx] + row[fea_idx + 1:])
    return new_data_set
def chose_best_fea(data_set):
    """Return the index of the feature with the largest information gain.

    Information gain is the entropy of the whole set minus the weighted
    entropy of the subsets obtained by splitting on the feature.

    Args:
        data_set: Non-empty 2-D data; the last column holds the labels and
            every other column is a feature.

    Returns:
        The column index of the best feature. Index 0 is returned when no
        feature has a strictly positive gain.
    """
    data_arr = np.array(data_set)
    # Hand cut_data_set plain-list rows so its slicing and concatenation do
    # not depend on numpy arithmetic semantics.
    rows = data_arr.tolist()
    # Float total so the subset weights are true fractions on Python 2 too.
    total = float(len(rows))
    fea_len = len(rows[0]) - 1
    # Entropy before splitting is the same for every feature: compute once.
    base_entro = gen_entro(rows)
    curr_max_gain = 0
    curr_max_idx = 0
    for fea_idx in range(fea_len):
        cond_entro = 0.0
        for val in np.unique(data_arr[:, fea_idx]):
            subset = cut_data_set(rows, fea_idx, val)
            cond_entro += (len(subset) / total) * gen_entro(subset)
        # Gain is what the split removes from the entropy, so the weighted
        # subset entropy must be subtracted, not added.
        curr_gain = base_entro - cond_entro
        if curr_gain > curr_max_gain:
            curr_max_idx = fea_idx
            curr_max_gain = curr_gain
    return curr_max_idx
def build_tree(data_set, label):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        data_set: Non-empty 2-D data; the last column holds the class labels.
        label: Feature names, one per feature column. The list is read only;
            it is no longer modified in place.

    Returns:
        A leaf label when no feature is left or all rows share one label.
        Otherwise a dict mapping ``(feature_name, feature_value)`` tuples to
        sub-trees built the same way.
    """
    data_arr = np.array(data_set)
    # Only the label column is left: nothing to split on, so take a vote.
    if len(data_arr[0]) == 1:
        return gen_major_label(data_arr)
    # Every row carries the same label: this branch is a leaf.
    if len(set(data_arr[:, -1])) == 1:
        return data_arr[0][-1]
    fea_idx = chose_best_fea(data_arr)
    label_name = label[fea_idx]
    # Build a fresh list instead of `del label[fea_idx]`: deleting in place
    # would alter the caller's list and leak one branch's deletions into its
    # sibling branches, which all need the same remaining names.
    sub_label = list(label[:fea_idx]) + list(label[fea_idx + 1:])
    # Plain-list rows keep cut_data_set's slicing independent of numpy.
    rows = data_arr.tolist()
    node = {}
    for val in np.unique(data_arr[:, fea_idx]):
        new_data_set = cut_data_set(rows, fea_idx, val)
        node[(label_name, val)] = build_tree(new_data_set, list(sub_label))
    return node
def gen_major_label(data_set):
    """Return the most frequent value in the last column of *data_set*.

    Args:
        data_set: 2-D data whose last column holds the class labels.

    Returns:
        The label with the highest count; among equally frequent labels the
        one counted first wins. ``0`` is returned if there are no rows.
    """
    labels = np.array(data_set)[:, -1]
    tally = {}
    for lab in labels:
        tally[lab] = tally.get(lab, 0) + 1
    if not tally:
        return 0
    # max() keeps the first maximal key it meets, i.e. the earliest label.
    return max(tally, key=tally.get)
def load_data_set(file_name):
    """Read the CSV at *file_name* and return ``(data_set, fea_name)``.

    NOTE: the parsed frame is currently discarded. A fixed toy data set and
    a fixed list of feature names are returned whatever the file contains;
    the file only has to be readable and have a column 4.

    Args:
        file_name: Path of a header-less CSV file.

    Returns:
        A tuple ``(data_set, fea_name)``: a list of rows and a list of
        feature-name strings.
    """
    frame = pd.read_csv(file_name, header=None)
    # Swap raw column 4 for an integer-coded 'label' column.
    raw_labels = frame.pop(4)
    frame['label'] = pd.factorize(raw_labels)[0]
    # Hard-coded stand-ins used instead of frame.values.
    toy_rows = [[1, 1, 1], [1, 1, 1], [1, 0, 0], [0, 1, 0], [0, 1, 0]]
    toy_names = ['fea0', 'fea1', 'fea2', 'fea3']
    return toy_rows, toy_names
def tree_test(model, x, label_name):
    """Classify sample *x* by walking the nested-dict tree *model*.

    Args:
        model: Either a leaf label or a dict whose keys are
            ``(feature_name, feature_value)`` tuples and whose values are
            sub-trees of the same form.
        x: Feature values of the sample, indexed like *label_name*.
        label_name: Feature names in the column order of *x*.

    Returns:
        The leaf label reached for *x*.

    Raises:
        KeyError: If a node is an empty dict or has no branch for the
            sample's value of the tested feature.
        ValueError: If a node tests a feature name missing from *label_name*.
    """
    names = list(label_name)
    curr_model = model
    while isinstance(curr_model, dict):
        if not curr_model:
            raise KeyError('empty tree node')
        # Read the tested feature from the node itself rather than assuming
        # the tree tests features in the order x[0], x[1], ...; the first
        # key is used to identify which feature this node splits on.
        fea_name = next(iter(curr_model))[0]
        fea_idx = names.index(fea_name)
        curr_model = curr_model[(fea_name, x[fea_idx])]
    return curr_model
测试程序(假设上面的决策树代码保存为 tree.py):
# Train the decision tree from the `tree` module and classify one sample.
import tree

model = tree.dec_tree('flower.csv')
model.train()
# Feature values of the sample to classify.
x = [1, 1, 1]
res = model.test(x)
# Call form of print runs on both Python 2 and Python 3.
print(res)