# Data dictionary for the Titanic columns, kept verbatim as a module-level string.
# In the text below, 离散 marks a discrete (categorical) variable and 连续 marks
# a continuous one.
pro = """
Variable Definition Key
survival Survival 0 = No, 1 = Yes, 离散
pclass Ticket class 1 = 1st, 2 = 2nd, 3 = 3rd, 离散
sex Sex 离散
Age Age in years 连续
sibsp # of siblings / spouses aboard the Titanic 连续
parch # of parents / children aboard the Titanic 连续
fare Passenger fare 连续
cabin Cabin number 离散
embarked Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton, 离散
"""
import pandas as pd
import numpy as np
import math
class Row:
    """A single passenger record: id, eight feature columns and the label.

    All constructor arguments are stored unchanged as attributes of the
    same name.
    """

    def __init__(self, PassengerId, Pclass, Sex, Age, SibSp, Parch, Fare, Cabin, Embarked, Survived):
        # Tuple assignment binds left to right, so the attributes are
        # created in the same order as the constructor parameters.
        (self.PassengerId, self.Pclass, self.Sex, self.Age, self.SibSp) = (
            PassengerId, Pclass, Sex, Age, SibSp)
        (self.Parch, self.Fare, self.Cabin, self.Embarked, self.Survived) = (
            Parch, Fare, Cabin, Embarked, Survived)

    def __repr__(self):
        """Return the attribute values rendered as a Python list literal."""
        field_names = (
            "PassengerId", "Pclass", "Sex", "Age", "SibSp",
            "Parch", "Fare", "Cabin", "Embarked", "Survived",
        )
        return str([getattr(self, field) for field in field_names])
def data_read_and_filter(train_path="D:\\!data\\Titanic\\train.csv",
                         test_path="D:\\!data\\Titanic\\test.csv"):
    """Load the Titanic train/test CSV files and convert them to Row lists.

    Args:
        train_path: Location of the training CSV. Defaults to the path
            that used to be hard-coded here.
        test_path: Location of the test CSV. Defaults to the path that
            used to be hard-coded here.

    Returns:
        A ``(data_train, data_test)`` tuple of lists of ``Row`` objects.
        Test rows carry ``Survived == -1`` because the test file has no
        label column.
    """
    feature_columns = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp',
                       'Parch', 'Fare', 'Cabin', 'Embarked']

    # Read both files before selecting columns, as the original did.
    train_frame = pd.read_csv(train_path)
    test_frame = pd.read_csv(test_path)

    # The label is placed first in the training selection and moved to the
    # last Row argument below.
    train_frame = train_frame[['Survived'] + feature_columns]
    test_frame = test_frame[feature_columns]

    train_records = np.array(train_frame).tolist()
    test_records = np.array(test_frame).tolist()

    data_train = [Row(*record[1:], record[0]) for record in train_records]
    data_test = [Row(*record, -1) for record in test_records]
    return data_train, data_test
def LL_grow(LL, a):
    """Refine a partition by splitting every group on attribute ``a``.

    Args:
        LL: A list of groups, each group being a list of objects that
            expose the attribute named by ``a``.
        a: Name of the attribute to split on.

    Returns:
        A new flat list of groups. Each input group is replaced by its
        sub-groups, one per distinct value of ``a``, in order of first
        appearance. Rows whose value is NaN (a missing value) are
        collected into a single sub-group.
    """
    # NaN never compares equal to another NaN, so used directly as a dict
    # key every missing value would open its own singleton group. All NaN
    # values are mapped to this one sentinel key instead.
    missing = object()

    refined = []
    for group in LL:
        buckets = {}
        for row in group:
            key = getattr(row, a)
            # `key != key` is true only for NaN.
            if isinstance(key, float) and key != key:
                key = missing
            buckets.setdefault(key, []).append(row)
        refined.extend(buckets.values())
    return refined
def entropy(x):
    """Return ``x * ln(x)``, or 0 when ``x`` is zero or negative.

    The result is not negated here; the caller applies the minus sign.
    """
    return 0 if x <= 0 else x * math.log(x)
def entropy_calculate(LL, a):
    """Return the size-weighted entropy of ``Survived`` after splitting on ``a``.

    The partition ``LL`` is first refined on attribute ``a`` with
    ``LL_grow``. For every resulting group the two-class entropy (natural
    logarithm) of the ``Survived == 1`` and ``Survived == 0`` fractions is
    computed, then weighted by the group's share of all rows.

    Args:
        LL: A list of groups of rows exposing ``Survived`` and ``a``.
        a: Name of the attribute to split on.

    Returns:
        The weighted sum of the per-group entropies (0 when there are no
        groups).
    """
    groups = LL_grow(LL, a)
    total_rows = sum(len(group) for group in groups)

    weighted = 0
    for group in groups:
        size = len(group)
        survived_share = sum(1 for row in group if row.Survived == 1) / size
        died_share = sum(1 for row in group if row.Survived == 0) / size
        # entropy() returns x*ln(x), so the sign is flipped here.
        group_entropy = -(entropy(survived_share) + entropy(died_share))
        weighted += group_entropy * size / total_rows
    return weighted
def tree_generate(data_train, T):
    """Placeholder for tree construction; currently does nothing.

    The original definition had no statements in its body, which is a
    syntax error. This docstring gives the function a valid body while
    keeping it a no-op.

    Args:
        data_train: Training rows (unused for now).
        T: Attribute ordering (unused for now).

    Returns:
        None.
    """
    # Start with the simplest case: treat every attribute as discrete.
    return None
def build_tree(data_train):
    """Greedily order the attributes by lowest weighted entropy.

    Starting from one group holding every training row, each round picks
    the remaining attribute with the smallest ``entropy_calculate`` value
    on the current partition (the earliest attribute wins ties), prints
    that value, and refines the partition on that attribute.

    Args:
        data_train: List of training rows.

    Returns:
        The attribute names in the order they were selected.
    """
    remaining = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
    chosen_order = []
    partition = [data_train]

    while remaining:
        # Seed the search with the first attribute still available.
        best_attr = remaining[0]
        best_entropy = entropy_calculate(partition, best_attr)

        for candidate in remaining[1:]:
            candidate_entropy = entropy_calculate(partition, candidate)
            # Strict comparison keeps the earliest attribute on ties.
            if candidate_entropy < best_entropy:
                best_attr, best_entropy = candidate, candidate_entropy

        chosen_order.append(best_attr)
        remaining.remove(best_attr)
        print("e0: ", best_entropy)
        partition = LL_grow(partition, best_attr)

    return chosen_order
if __name__ == '__main__':
    # Script entry point: load both data sets, then compute the attribute
    # ordering from the training rows. The test rows are loaded but not
    # used here.
    data_train, data_test = data_read_and_filter()
    T = build_tree(data_train)