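# Notes: for prediction problems like this one, data science is a discipline
# worth mastering in its own right. However good the model is, feeding it
# poor-quality data will not produce an excellent score. That means staying
# sensitive to the data and thinking, from several angles, about the obvious
# and the hidden relationships between features. There are many tools
# involved; learn them step by step.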
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
# Read the two CSV files named in the paths below: train.csv is bound to
# `data` and test.csv to `test_data`. The paths are absolute and
# machine-specific.
data = pd.read_csv(filepath_or_buffer='F:/ML/kaggle/titanic/train.csv')
test_data = pd.read_csv(filepath_or_buffer='F:/ML/kaggle/titanic/test.csv')
def process(data):
data['Age'] = data['Age'].fillna(data['Age'].median())
counts = data['Embarked'].value_counts()
fill = counts.idxmax()
data['Embarked'] = data['Embarked'].fillna(fill)
data.loc[data[&#