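# I looked through CSDN and found nobody had used a LightGBM model on the Titanic
# dataset, so I wrote one myself. The commented-out parts of the code are my various
# attempts at fixing errors, so I have left them in. Feel free to take the code.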
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# Load the training data set from the local CSV file.
titanic = pd.read_csv("D:\\AAAAA.泰坦尼克\\train.csv")
print(type(titanic))
# Fill missing ages with the median age (only the Age column is touched).
titanic = titanic.fillna({"Age": titanic["Age"].median()})
#默认进行替换 用数字替换Sex栏的值
#titanic.loc[titanic["Sex"]=="male","Sex"]=0
#titanic.loc[titanic["Sex"]=="female","Sex"]=1
#全都换成0 1 2
#print(titanic["Embarked"].unique())
#titanic["Embarked"]=titanic["Embarked"].fillna('S')
#titanic.loc[titanic["Embarked"]== "S","Embarked"]=0
#titanic.loc[titanic["Embarked"]== "C","Embarked"]=1
#titanic.loc[titanic["Embarked"]== "Q","Embarked"]=2
# Columns used as model inputs.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Feature matrix and target. copy() makes X an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
X = titanic[predictors].copy()
y = titanic["Survived"]
from sklearn import preprocessing
# Embarked may contain missing values (an earlier commented-out attempt in this
# file fills them); replace them with the most frequent value so LabelEncoder
# only ever sees strings.
embarked_mode = X["Embarked"].mode()
if not embarked_mode.empty:
    X["Embarked"] = X["Embarked"].fillna(embarked_mode.iloc[0])
# Encode the string categories as integers. `encoder` is refitted per column and
# ends up fitted on Embarked, which is what later code in this file expects to exist.
encoder = preprocessing.LabelEncoder()
for column in ("Sex", "Embarked"):
    X[column] = encoder.fit_transform(X[column])
print(y)
# Split into training and hold-out parts.
# NOTE(review): test_size=0.7 keeps only 30% of the rows for training; this looks
# like it may have been meant as 0.3 -- confirm before changing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.7, random_state=0
)
print(y_test)
# Show the distribution of the encoded categorical features.
print(X['Sex'].value_counts())
print(X['Embarked'].value_counts())
# Wrap the splits as LightGBM datasets; the hold-out set references the training set.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# Training settings.
boost_round = 50  # maximum number of boosting iterations
early_stop_rounds = 10  # stop if the validation metrics do not improve for this many rounds
params = {
    'boosting_type': 'gbdt',  # boosting type (was the unknown key/value 'boost_type': 'gobbet')
    'objective': 'binary',  # the target is 0/1 and predictions are thresholded at 0.5
    'metric': ['l2', 'auc'],  # evaluation metrics (a list keeps the order deterministic)
    'num_leaves': 31,  # number of leaves per tree
    'learning_rate': 0.05,  # learning rate
    'feature_fraction': 0.9,  # fraction of features sampled when building a tree
    'bagging_fraction': 0.8,  # fraction of rows sampled when building a tree
    'bagging_freq': 5,  # k means bagging is performed every k iterations
    'verbose': 1,  # <0 fatal only, =0 errors (warnings), >0 info
}
# Filled by the record_evaluation callback with the per-iteration metric values.
result = {}
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=boost_round,
    valid_sets=[lgb_train, lgb_eval],
    # Names follow the order of valid_sets (they were swapped before).
    valid_names=['train', 'validate'],
    # Callbacks replace the early_stopping_rounds= / evals_result= keyword
    # arguments, which newer LightGBM releases no longer accept.
    callbacks=[
        lgb.early_stopping(stopping_rounds=early_stop_rounds),
        lgb.record_evaluation(result),
    ],
)
# Load the test set and apply the same preprocessing as the training features.
test2 = pd.read_csv("D:\\AAAAA.泰坦尼克\\test.csv")
# Training Age had its gaps filled with the median before fitting, so fill the
# numeric gaps in the test set with the training medians as well.
for column in ("Age", "Fare"):
    test2[column] = test2[column].fillna(titanic[column].median())
# Encode the categorical columns with encoders fitted on the TRAINING values, so
# every category maps to the integer the model was trained on. `titanic` still
# holds the raw string categories because only the feature frame X was encoded.
for column in ("Sex", "Embarked"):
    train_values = titanic[column].dropna()
    test2[column] = test2[column].fillna(train_values.mode().iloc[0])
    column_encoder = preprocessing.LabelEncoder().fit(train_values)
    # transform() raises ValueError if the test set contains an unseen category.
    test2[column] = column_encoder.transform(test2[column])
y_gbm_prob = gbm.predict(test2[predictors])
print("未改前", y_gbm_prob)
# Turn the raw scores into 0/1 labels in one step.
y_gbm_pred = np.where(y_gbm_prob > .5, 1, 0)
print("改后 ", y_gbm_pred)
test1 = pd.read_csv("D:\\AAAAA.泰坦尼克\\gender_submission.csv")
print(test1)
#date=pd.DataFrame(
# {
# "Passenger":test2["PassengerId"],
# 'Survived':y_gbm_pred
#
# }
#)
#date.to_csv("submission.csv",index=False)
# NOTE(review): gender_submission.csv is presumably a sample submission rather
# than the true labels -- confirm. If so, this number is the agreement with that
# baseline, not the real accuracy; the hold-out split (X_test / y_test) would be
# the place to measure accuracy.
gbm_score = accuracy_score(test1["Survived"], y_gbm_pred)  # accuracy
print('gbm_score:', gbm_score)
#prediction =gbm.predict(test[predictors])
#accuracy = sum(prediction == test1["Survived"]) / len(prediction)
#print(accuracy)