# python xgboost分析婚外情几率

1 我用的是Anaconda，先安装xgboost
2 数据集：（课程作业，我也不知道这个数据集哪里来的）

3 先用随机森林测试性能，以便与xgboost的结果进行对比

#coding=utf-8
import pandas as pd
from pandas import Series,DataFrame
import random
import numpy as np
import time
from datetime import date
import datetime as dt
from numpy import nan as NA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.metrics import auc
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# Load the data set; the first row of the CSV holds the column names.
gdata = pd.read_csv("Affairs.csv", header=0)

# Inspect the size and the first few rows.
print(gdata.shape)
print(gdata.head(5))

# Report, per column, whether any value is missing. A bare expression only
# displays something in a notebook, so print it to keep the check visible when
# this runs as a script.
print(gdata.isnull().any())

# Encode the string columns as integers. Whole-column assignment replaces the
# chained `gdata.gender[mask] = ...` form, which writes through an intermediate
# Series: pandas flags that with SettingWithCopyWarning, and with copy-on-write
# enabled it no longer updates `gdata` at all.
# NOTE: a value outside the mapping becomes NaN instead of staying a string.
gdata['gender'] = gdata['gender'].map({"male": 0, "female": 1})
gdata['children'] = gdata['children'].map({"no": 0, "yes": 1})

# Binarise the label so the result can be compared with xgboost:
# rating <= 3 -> 0, rating > 3 -> 1.
gdata['rating'] = (gdata['rating'] > 3).astype(int)

# Random forest regression on the 0/1 rating label.
Feature = gdata[[
    'affairs', 'gender', 'age', 'yearsmarried',
    'children', 'religiousness', 'education', 'occupation',
]].values
Rating = gdata['rating'].values

# fit() returns the estimator itself, so construction and training chain.
rf = RandomForestRegressor().fit(Feature, Rating)

# NOTE: predictions are made on the same rows the model was fitted on.
predict = rf.predict(Feature)

# Mean squared error (the name MAD is kept so existing calls keep working).
def MAD(target, predictions):
    """Return the mean squared error between *target* and *predictions*.

    Both arguments are converted to float arrays first, so plain Python
    sequences are accepted as well as NumPy arrays.
    """
    target = np.asarray(target, dtype=float)
    predictions = np.asarray(predictions, dtype=float)
    squared_deviation = np.power(target - predictions, 2)
    return np.mean(squared_deviation)
# Print the value MAD() returns for the labels and the forest's predictions.
print( MAD(Rating, predict) )

# Build the ROC curve from the labels and the predicted scores, then print the
# area under that curve.
fpr,tpr,thresholds = metrics.roc_curve(Rating, predict)
AUC = metrics.auc(fpr, tpr)
print(AUC)

运行结果（在训练数据上评估）：
MAD（实际为均方误差）：0.0424662760667
AUC：0.995788061704

xgboost测试代码：

# xgboost is not among the imports at the top of this file; import it here so
# the snippet runs on its own.
import xgboost as xgb

# Use the 0/1 rating array as the label (the original referenced an undefined
# `label`). Force a float matrix in case the encoded columns are object dtype.
dtrain = xgb.DMatrix(np.asarray(Feature, dtype=float), label=Rating)
# NOTE: the evaluation set is the training set itself, so the AUC is in-sample.
dtest = dtrain

# Current XGBoost does not recognise the old 'bst:' prefix, so those keys would
# be ignored and depth / learning rate would fall back to their defaults.
# 'silent' has been replaced by 'verbosity'.
param = {
    'max_depth': 2,
    'eta': 1,
    'verbosity': 0,
    'objective': 'binary:logistic',
}
param['nthread'] = 4
param['eval_metric'] = 'auc'

# The metric is printed for every entry in this list after each boosting round.
evallist = [(dtest, 'eval'), (dtrain, 'train')]

num_round = 200
bst = xgb.train(param, dtrain, num_round, evallist)

[196] eval-auc:0.998042 train-auc:0.998042
[197] eval-auc:0.998042 train-auc:0.998042
[198] eval-auc:0.998042 train-auc:0.998042
[199] eval-auc:0.998028 train-auc:0.998028