Tools: PyCharm, Python 3.8
Code example adapted from:
https://www.bilibili.com/video/BV1WD4y1U7og?p=3
师兄发的一个案例分析,我自己手打了逻辑回归,随机森林的代码,发现原作由于版本问题,与我的python版本和编译器不兼容,故在有些地方进行了修改。
数据集来自kaggle官网
https://www.kaggle.com/c/titanic/data
Codes
导入需要的packages
import pandas
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest , f_classif
from sklearn.ensemble import GradientBoostingClassifier
数据预处理
# ---- Data preprocessing ----
# Read the Titanic training set (https://www.kaggle.com/c/titanic/data).
titanic = pandas.read_csv("train.csv")

# Fill missing ages with the column median so every numeric feature is complete.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

# Encode Sex numerically: male -> 0, female -> 1.
# Series.map replaces the pair of .loc label-assignments in one pass.
titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1})

# Fill missing Embarked values with 'S' (the most frequent port),
# then encode the ports as S -> 0, C -> 1, Q -> 2.
titanic["Embarked"] = titanic["Embarked"].fillna('S')
titanic["Embarked"] = titanic["Embarked"].map({"S": 0, "C": 1, "Q": 2})
线性回归（这个得出来的结果也太差了，我怀疑我错了，但是不知道错在哪里）
# ---- Linear regression with manual 3-fold cross-validation ----
# The seven features used by every model in this script.
predictors = ["Pclass" , "Sex" , "Age" , "SibSp" , "Parch" , "Fare" , "Embarked"]
# linear regression
alg = LinearRegression()
kf = KFold(n_splits=3 , shuffle=True , random_state=1)
# BUG FIX: with shuffle=True the fold order does not match the row order, so
# out-of-fold predictions must be written back by index instead of being
# concatenated fold-by-fold.  Pre-allocate one slot per row.
predictions = np.zeros(titanic.shape[0])
# BUG FIX: split on titanic[predictors]; the original split on
# titanic[predictions], i.e. an empty column selection.
for train , test in kf.split(titanic[predictors]):
    train_predictors = titanic[predictors].iloc[train , :]
    train_target = titanic["Survived"].iloc[train]
    alg.fit(train_predictors , train_target)
    # Place this fold's predictions at the rows they belong to.
    predictions[test] = alg.predict(titanic[predictors].iloc[test , :])
# Threshold the regression output at 0.5 to obtain class labels.
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
# BUG FIX: accuracy is the fraction of matching rows.  The original summed the
# predicted *values* at matching positions, which counted only the correct 1s
# and ignored every correct 0 — hence the suspiciously bad score.
accuracy = (predictions == titanic["Survived"]).mean()
print('linearRegression:' , accuracy)
逻辑回归
# ---- Logistic regression ----
# Score a logistic-regression classifier with plain 3-fold cross-validation
# on the same seven features; max_iter is raised so the solver converges.
log_reg = LogisticRegression(random_state=1 , max_iter=5000)
cv_scores = model_selection.cross_val_score(
    log_reg , titanic[predictors] , titanic["Survived"] , cv=3)
print('logisticRegression:' , cv_scores.mean())
随机森林
# ---- Random forest ----
predictors = ["Pclass" , "Sex" , "Age" , "SibSp" , "Parch" , "Fare" , "Embarked"]
# One shuffled 3-fold splitter reused for both configurations (the original
# built an identical KFold twice).
kf = KFold(n_splits=3 , random_state=1 , shuffle=True)
# Evaluate two forest configurations; the larger forest with stricter split /
# leaf minimums is the standard remedy for the small forest's overfitting.
forest_configs = [
    ('randomForest_T10:' , 10 , 2 , 1),
    ('randomForest_T50:' , 50 , 4 , 2),
]
for label , n_trees , min_split , min_leaf in forest_configs:
    alg = RandomForestClassifier(random_state=1 , n_estimators=n_trees ,
                                 min_samples_split=min_split ,
                                 min_samples_leaf=min_leaf)
    golds = model_selection.cross_val_score(
        alg , titanic[predictors] , titanic["Survived"] , cv=kf)
    print(label , golds.mean())
结果: