实例:Titanic生存率预测实例(通过sklearn实现Knn,决策树,线性回归)
import pandas as pd
from IPython.display import display
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
for column in train.columns:
print("name:{0} miss rate:{1:.2f} ".format(column,1-train[column].count()/len(train)))
for column in test.columns:
print("name:{0} miss rate:{1:.2f} ".format(column,1-test[column].count()/len(train)))
train,test= train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin']),test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')
train['Age'] = train['Age'].fillna(int(train['Age'].mean()))
test['Age'] = test['Age'].fillna(int(test['Age'].mean()))
test['Fare'] = test['Fare'].fillna(float(test['Fare'].dropna().mode()[0]))
train.loc[train["Sex"]=="male","Sex"] = 0;
train.loc[train["Sex"]=="female","Sex"] = 1;
train.loc[train["Embarked"]=="S","Embarked"] = 0;
train.loc[train["Embarked"]=="C","Embarked"] = 1;
train.loc[train["Embarked"]=="Q","Embarked"] = 2;
test.loc[test