import pandas as pd
import numpy as py
import matplotlib.pyplot as plt
from IPython.display import Image
# Global matplotlib settings, applied in one call.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # font used so Chinese labels display correctly
    'axes.unicode_minus': False,    # so the minus sign displays correctly
    'figure.figsize': (10, 6),      # default figure size
})
# Load the training data from train.csv (path relative to the working directory).
train = pd.read_csv('train.csv')
train.head()  # preview; the value is only displayed in an interactive session

# Missing-value strategy:
#   - categorical variables: fill with a placeholder string (NA) or with the
#     most frequent category
#   - continuous variables: fill with the mean, median or mode
train.isnull().sum()  # missing-value count per column before filling
train['Embarked'].value_counts()  # inspect category frequencies before choosing a fill value

# Column -> replacement used for its missing entries.
fill_values = {
    'Age': train['Age'].mean(),
    'Cabin': 'NA',
    'Embarked': 'S',
}
for column, replacement in fill_values.items():
    train[column] = train[column].fillna(replacement)

train.isnull().sum()  # missing-value count per column after filling
# Task 3: encode the categorical variables.
# Select the feature columns and let pandas turn the non-numeric ones into
# indicator (dummy) columns.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
data = pd.get_dummies(train[feature_columns])
data.head()  # preview of the encoded features (displayed only interactively)
data.shape   # (rows, columns) after encoding
from sklearn.model_selection import train_test_split
# To read the documentation of train_test_split, use `train_test_split?` in
# IPython/Jupyter or help(train_test_split) in plain Python. The `?` form is
# not valid Python syntax, so it is kept only as a comment to let this file
# run as a regular script.

# Features and target for the model.
X = data
y = train['Survived']

# stratify=y splits the data so that both subsets keep the class proportions
# of y; random_state=0 fixes the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, stratify=y
)
X_train.shape, X_test.shape  # sizes of the two subsets (displayed only interactively)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Logistic regression with default parameters.
Ir = LogisticRegression().fit(X_train, y_train)
train_accuracy = Ir.score(X_train, y_train)
print('training set score: {}'.format(train_accuracy))
test_accuracy = Ir.score(X_test, y_test)
print("Testing set score: {}".format(test_accuracy))

# Parameter tuning: same model with C=100.
Ir2 = LogisticRegression(C=100)
Ir2.fit(X_train, y_train)
Ir2.score(X_train, y_train)  # training score; the value is not printed
tuned_test_accuracy = Ir2.score(X_test, y_test)
print("test set score : {:.2f}".format(tuned_test_accuracy))
# Random forest classifier with default parameters.
# NOTE(review): no random_state is passed to either forest, so the reported
# scores may differ from run to run.
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
# The first line reports the score on the training data, the second on the
# held-out test data; the labels now say so.
print("train set score : {:.2f}".format(rfc.score(X_train, y_train)))
print("test set score : {:.2f}".format(rfc.score(X_test, y_test)))

# Tuned random forest.
# n_estimators, default=100: the number of trees in the forest.
# max_depth, default=None: the maximum depth of a tree. If None, nodes are
# expanded until all leaves are pure or until all leaves contain fewer than
# min_samples_split samples.
rfc1 = RandomForestClassifier(n_estimators=200, max_depth=8)
rfc1.fit(X_train, y_train)
print("train set score : {:.2f}".format(rfc1.score(X_train, y_train)))
print("test set score : {:.2f}".format(rfc1.score(X_test, y_test)))
# Output the model's predictions: run the default logistic-regression model
# (Ir) on the training features and look at the first ten results.
pre = Ir.predict(X_train)
pre[0:10]  # displayed only in an interactive session