信息熵:
划分依据:
信息增益:
API:
代码实现:
# Decision-tree classifier on the Titanic passenger data.
#
# NOTE(security): the next line turns off HTTPS certificate verification for
# the whole process so that the download below succeeds even when the host's
# certificate cannot be validated. Do not copy this into code that handles
# sensitive traffic.
ssl._create_default_https_context = ssl._create_unverified_context

# Load the data
df = pd.read_csv('https://datahub.csail.mit.edu/download/jander/historic/file/titanic.csv')
print(df.head())

# Basic cleaning: replace missing ages with the mean age.
# The result is assigned back to the column instead of calling
# ``fillna(..., inplace=True)`` on ``df['age']``: that chained in-place form
# is deprecated and does not modify ``df`` when pandas Copy-on-Write is on.
df['age'] = df['age'].fillna(df['age'].mean())
x = df[['pclass', 'age', 'sex']]
y = df['survived']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Feature extraction: one-hot encode the string columns via DictVectorizer.
# (Named ``vectorizer`` so the builtin ``dict`` is not shadowed.)
vectorizer = DictVectorizer()
x_train = vectorizer.fit_transform(x_train.to_dict(orient='records'))
# ``get_feature_names()`` was removed in scikit-learn 1.2;
# ``get_feature_names_out()`` is its replacement (available since 1.0).
print(vectorizer.get_feature_names_out().tolist())
x_test = vectorizer.transform(x_test.to_dict(orient='records'))

# Machine learning -- decision tree
estimator = DecisionTreeClassifier()
estimator.fit(x_train, y_train)
y_pre = estimator.predict(x_test)
print('预测的准确率为:\n', estimator.score(x_test, y_test))
print('预测的结果为:\n', y_pre)

# Save the tree structure as a Graphviz .dot file.
# NOTE(review): these display labels must line up one-to-one with the column
# order printed above; this assumes the vectorizer produced exactly
# age, pclass=1st/2nd/3rd, sex=female/male -- confirm against the printout.
export_graphviz(
    estimator,
    out_file='./tree.dot',
    feature_names=['年龄', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', '女性', '男性'],
)
优缺点:
随机森林:
代码实现:
# Random-forest classifier (with grid search) on the Titanic passenger data.
#
# NOTE(security): despite what the original note suggested, the next line
# does not add verification -- it turns HTTPS certificate verification OFF
# for the whole process so that the download below succeeds even when the
# host's certificate cannot be validated. Do not copy this into code that
# handles sensitive traffic.
ssl._create_default_https_context = ssl._create_unverified_context

# Load the data
df = pd.read_csv('https://datahub.csail.mit.edu/download/jander/historic/file/titanic.csv')
print(df.head())

# Basic cleaning: replace missing ages with the mean age.
# The result is assigned back to the column instead of calling
# ``fillna(..., inplace=True)`` on ``df['age']``: that chained in-place form
# is deprecated and does not modify ``df`` when pandas Copy-on-Write is on.
df['age'] = df['age'].fillna(df['age'].mean())
x = df[['pclass', 'age', 'sex']]
y = df['survived']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Feature extraction: one-hot encode the string columns via DictVectorizer.
# (Named ``vectorizer`` so the builtin ``dict`` is not shadowed.)
vectorizer = DictVectorizer()
x_train = vectorizer.fit_transform(x_train.to_dict(orient='records'))
# ``get_feature_names()`` was removed in scikit-learn 1.2;
# ``get_feature_names_out()`` is its replacement (available since 1.0).
print(vectorizer.get_feature_names_out().tolist())
x_test = vectorizer.transform(x_test.to_dict(orient='records'))

# Machine learning -- random forest tuned with 10-fold cross-validated
# grid search over the number of trees and the maximum tree depth.
rf = RandomForestClassifier()
parm = {
    'n_estimators': [100, 200, 300, 400, 500, 600],
    'max_depth': [5, 7, 12, 15, 18, 21],
}
estimator = GridSearchCV(rf, param_grid=parm, cv=10)
estimator.fit(x_train, y_train)
print('准确率为:\n', estimator.score(x_test, y_test))
print('准确率最高的模型:\n', estimator.best_estimator_)
print('最好的参数:\n', estimator.best_params_)