#导入相关库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# Training data
data_train = pd.read_csv("C:/Users/Dell/Desktop/Data/train.csv")
# Test data
data_test = pd.read_csv("C:/Users/Dell/Desktop/Data/test.csv")
# Show per-column non-null counts and dtypes for both data sets.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
data_train.info()
data_test.info()
from sklearn.ensemble import RandomForestRegressor
def set_missing_age(df):
    """Fill missing ``Age`` values of *df* with random-forest predictions.

    A ``RandomForestRegressor`` is trained on the rows whose ``Age`` is known,
    using ``Fare``, ``Parch``, ``SibSp`` and ``Pclass`` as features, and its
    predictions are written into the rows whose ``Age`` is missing.

    Args:
        df: DataFrame containing the columns ``Age``, ``Fare``, ``Parch``,
            ``SibSp`` and ``Pclass``. It is modified in place.

    Returns:
        The same DataFrame object, with the missing ``Age`` entries filled.
        If no ``Age`` value is missing, *df* is returned untouched.
    """
    # 'Age' must stay the first column: it is split off as the target below.
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]

    missing_mask = age_df['Age'].isnull()
    # Nothing to impute: skip the (expensive) fit, and avoid calling predict()
    # on an empty array, which scikit-learn rejects.
    if not missing_mask.any():
        return df

    # Split passengers into those with a known age and those without.
    known_age = age_df[~missing_mask].values
    unknown_age = age_df[missing_mask].values

    # Target y is the age column; features x are the remaining columns.
    y = known_age[:, 0]
    x = known_age[:, 1:]

    # Fit the random forest on the passengers whose age is known.
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(x, y)

    # Predict the missing ages and write them back into the original frame.
    predicted_ages = rfr.predict(unknown_age[:, 1:])
    df.loc[missing_mask, 'Age'] = predicted_ages
    return df
# Fill in the missing ages of the training data
data_train = set_missing_age(data_train)
# Remove the training rows whose port of embarkation is missing
embarked_missing = data_train['Embarked'].isnull()
rows_without_port = data_train.loc[embarked_missing].index
data = data_train.drop(index=rows_without_port)
import sklearn.preprocessing as preprocessing
# One-hot encode the categorical features
def set_numeralization(data):
    """Replace the categorical columns of *data* with one-hot indicator columns.

    ``Embarked``, ``Sex`` and ``Pclass`` are each expanded with
    ``pd.get_dummies`` into columns named ``<column>_<value>``; the indicator
    columns are appended after the existing columns and the three original
    columns are removed.

    Args:
        data: DataFrame containing the columns ``Embarked``, ``Sex`` and
            ``Pclass``. It is not modified.

    Returns:
        A new DataFrame with the encoded columns in place of the originals.
    """
    categorical_columns = ['Embarked', 'Sex', 'Pclass']
    # One indicator frame per categorical column, prefixed with its name.
    dummies = [
        pd.get_dummies(data[column], prefix=column)
        for column in categorical_columns
    ]
    # Append the indicator columns, then drop the columns they replace.
    df = pd.concat([data] + dummies, axis=1)
    return df.drop(columns=categorical_columns)
# Standardize the numeric features
def set_normalization(df):
    """Add standardized copies of the ``Age`` and ``Fare`` columns to *df*.

    Each column is passed through its own ``StandardScaler`` and the result is
    stored as ``Age_scaled`` / ``Fare_scaled``.

    Args:
        df: DataFrame containing the columns ``Age`` and ``Fare``. It is
            modified in place.

    Returns:
        The same DataFrame object with the two ``*_scaled`` columns added.
    """
    for column in ('Age', 'Fare'):
        # StandardScaler expects a 2-D array, hence the single-column reshape.
        values = df[column].values.reshape(-1, 1)
        # fit_transform already fits the scaler, so no separate fit() call is
        # needed, and the fitted scaler must not be passed as the `y` argument.
        scaled = preprocessing.StandardScaler().fit_transform(values)
        df[column + '_scaled'] = scaled.ravel()
    return df
# Feature engineering on the training data
data = set_numeralization(data)
data = set_normalization(data)

# Same preprocessing for the test data.
# Fare is one of the regression features used by set_missing_age, so fill its
# missing values (with the median) before imputing Age. The result is assigned
# back to the column: a chained `data_test['Fare'].fillna(..., inplace=True)`
# acts on an intermediate object and is not guaranteed to update data_test.
data_test['Fare'] = data_test['Fare'].fillna(data_test['Fare'].median())
data_test = set_missing_age(data_test)
# NOTE(review): this removes test rows with a missing Embarked value, which
# would also remove them from the predictions - confirm the test set has none.
data_test = data_test.drop(data_test[data_test.Embarked.isnull()].index)
data_test = set_numeralization(data_test)
data_test = set_normalization(data_test)
# info() prints its own report and returns None, so it is not wrapped in print().
data_test.info()
data.info()
#from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
# Target labels for training
y = data["Survived"]
# Columns fed to the classifier
features = [
    "Pclass_1", "Pclass_2", "Pclass_3",
    "Sex_male", "Sex_female",
    "SibSp", "Parch", "Age", "Fare",
    "Embarked_C", "Embarked_Q", "Embarked_S",
]
# Select the model inputs from the training and test frames
train_features = data[features]
test_features = data_test[features]
X = pd.get_dummies(train_features)
X_test = pd.get_dummies(test_features)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaler once on the training features and reuse the same fitted
# scaler for the test features, so both are standardized with the training
# statistics (no second fit is needed).
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)
# Wrap the scaled arrays back into DataFrames with the feature names as columns
X_scaled = pd.DataFrame(X_scaled, columns=features)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=features)
# Train an RBF-kernel support vector classifier and predict on the test set
model = svm.SVC(C=3, kernel='rbf', gamma=0.1)
model.fit(X_scaled, y)
predictions = model.predict(X_test_scaled)
# Build the submission table (passenger id + predicted label) and save it
output = pd.DataFrame({'PassengerId': data_test.PassengerId, 'Survived': predictions})
print(output)
output.to_csv('C:/Users/Dell/Desktop/Data/gender_submission.csv', index=False)
print("Your submission was successfully saved!")
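# Titanic survival prediction (泰坦尼克遇难预测)
# Footer of the source web page: latest recommended article published 2024-06-14 23:48:57 (最新推荐文章于 2024-06-14 23:48:57 发布)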