# 73. Titanic

# Data dictionary for the Titanic columns, kept as a reference string with
# three columns: variable name, definition, and value key. The last tag on
# each row marks the variable as discrete (离散) or continuous (连续).
# Nothing in the visible code reads this string.
pro = """
Variable	Definition	                                Key                   
survival	Survival	                                0 = No, 1 = Yes, 离散
pclass	    Ticket class	                            1 = 1st, 2 = 2nd, 3 = 3rd, 离散
sex     	Sex	                                        离散
Age	        Age in years	                            连续
sibsp	    # of siblings / spouses aboard the Titanic	连续
parch	    # of parents / children aboard the Titanic	连续    
fare	    Passenger fare	                            连续
cabin	    Cabin number	                            离散
embarked	Port of Embarkation	                        C = Cherbourg, Q = Queenstown, S = Southampton, 离散
"""

import pandas as pd
import numpy as np
import math

class Row:
    """One passenger record.

    Every constructor argument is stored unchanged on an attribute of the
    same name, so a row can be queried with ``getattr(row, "Sex")`` and
    similar lookups.
    """

    def __init__(self, PassengerId, Pclass, Sex, Age, SibSp, Parch, Fare, Cabin, Embarked, Survived):
        # Assigned in the same order that __repr__ lists them.
        self.PassengerId, self.Pclass, self.Sex = PassengerId, Pclass, Sex
        self.Age, self.SibSp, self.Parch = Age, SibSp, Parch
        self.Fare, self.Cabin, self.Embarked = Fare, Cabin, Embarked
        self.Survived = Survived

    def __repr__(self):
        """Return the ten stored values formatted like a Python list."""
        values = [
            self.PassengerId,
            self.Pclass,
            self.Sex,
            self.Age,
            self.SibSp,
            self.Parch,
            self.Fare,
            self.Cabin,
            self.Embarked,
            self.Survived,
        ]
        return str(values)

def data_read_and_filter(
    train_path="D:\\!data\\Titanic\\train.csv",
    test_path="D:\\!data\\Titanic\\test.csv",
):
    """Load the Titanic train and test CSV files as lists of ``Row`` objects.

    Args:
        train_path: Path of the training CSV. It must contain a ``Survived``
            column in addition to the feature columns. Defaults to the
            location the script originally hard-coded.
        test_path: Path of the test CSV, which only needs the feature
            columns. Defaults to the originally hard-coded location.

    Returns:
        A ``(data_train, data_test)`` tuple of lists of ``Row``. Test rows
        carry ``Survived = -1`` because the test file has no label column.

    Cell values are passed through unchanged; missing cells stay as whatever
    pandas produced for them (NaN by default) and are not imputed here.
    """
    # Listed in the positional order of Row's constructor (label excluded).
    feature_columns = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

    train_frame = pd.read_csv(train_path)[feature_columns + ['Survived']]
    test_frame = pd.read_csv(test_path)[feature_columns]

    # np.array(...).tolist() turns each record into a list of plain Python
    # values that can be unpacked straight into Row.
    data_train = [Row(*values) for values in np.array(train_frame).tolist()]
    data_test = [Row(*values, -1) for values in np.array(test_frame).tolist()]

    return data_train, data_test

def LL_grow(LL, a):
    """Refine a partition by splitting every group on attribute ``a``.

    Args:
        LL: List of groups, each group being a list of objects that expose
            the attribute named ``a``.
        a: Name of the attribute to split on.

    Returns:
        A new flat list of groups. Input groups are processed in order; the
        sub-groups of one input group appear in order of first occurrence of
        their attribute value, and members keep their relative order. Empty
        input groups contribute nothing.
    """
    refined = []
    for group in LL:
        buckets = {}
        for item in group:
            buckets.setdefault(getattr(item, a), []).append(item)
        refined.extend(buckets.values())
    return refined

def entropy(x):
    """Return ``x * ln(x)``, or 0 when ``x`` is zero or negative.

    The guard avoids calling ``math.log`` on a non-positive value. Note that
    the result is the raw ``x * ln(x)`` term; any sign flip is left to the
    caller.
    """
    return 0 if x <= 0 else x * math.log(x)

def entropy_calculate(LL, a):
    """Return the size-weighted entropy of ``Survived`` after splitting on ``a``.

    The groups in ``LL`` are first refined with ``LL_grow(LL, a)``. For each
    resulting group the fractions of members with ``Survived == 1`` and
    ``Survived == 0`` are turned into ``-(p*ln(p) + q*ln(q))``, and these
    values are averaged with each group weighted by its share of all members.

    Args:
        LL: List of groups (lists) of objects with a ``Survived`` attribute
            and the attribute named ``a``.
        a: Name of the attribute to split on.

    Returns:
        The weighted sum described above; 0 when there are no members.
    """
    groups = LL_grow(LL, a)
    total = sum(len(group) for group in groups)

    weighted = 0
    for group in groups:
        size = len(group)
        survived = sum(1 for row in group if row.Survived == 1)
        perished = sum(1 for row in group if row.Survived == 0)
        group_entropy = -(entropy(survived / size) + entropy(perished / size))
        weighted += group_entropy * size / total
    return weighted

def tree_generate(data_train, T):
    """Placeholder for tree construction; not implemented yet.

    The definition previously had no body at all, which is a syntax error
    and stopped the whole module from being parsed. It is now an explicit
    stub so the rest of the file can run.

    Args:
        data_train: Training rows (presumably the list returned by
            ``data_read_and_filter`` -- confirm when implementing).
        T: Presumably the attribute order returned by ``build_tree`` --
            confirm when implementing.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("tree_generate is not implemented yet")



# Simplest version first: every attribute is treated as discrete.
def build_tree(data_train):
    """Greedily order the attributes by the entropy left after splitting.

    Starting from a single group holding all of ``data_train``, each round
    scores every remaining attribute with ``entropy_calculate``, picks the
    one with the lowest score (the earliest one on ties), prints that score,
    and refines the current groups by the chosen attribute with ``LL_grow``.

    Args:
        data_train: List of row objects exposing the eight attributes listed
            below and ``Survived``.

    Returns:
        The attribute names in the order they were chosen.
    """
    remaining = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
    chosen = []
    groups = [data_train]

    while remaining:
        scores = [entropy_calculate(groups, name) for name in remaining]
        # min() keeps the first index among equal scores, matching a strict
        # "replace only if lower" scan from left to right.
        best_idx = min(range(len(remaining)), key=scores.__getitem__)
        best_score = scores[best_idx]

        best_name = remaining.pop(best_idx)
        chosen.append(best_name)
        print("e0: ", best_score)
        groups = LL_grow(groups, best_name)

    return chosen

if __name__ == '__main__':
    # Script entry point: load both CSV files, then compute the attribute
    # order from the training rows. Neither data_test nor T is used further
    # in the visible code.
    data_train, data_test = data_read_and_filter()
    T = build_tree(data_train)


 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值