1.EDA及数据预处理
## 基础函数库
import numpy as np
import pandas as pd
## 绘图函数库
import matplotlib.pyplot as plt
import seaborn as sns
# Load the data set that the rest of the analysis works on.
data = pd.read_csv('train.csv')
data.head()
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2012/1/19 | MountGinini | 12.1 | 23.1 | 0.0 | NaN | NaN | W | 30.0 | N | ... | 60.0 | 54.0 | NaN | NaN | NaN | NaN | 17.0 | 22.0 | No | No |
| 1 | 2015/4/13 | Nhil | 10.2 | 24.7 | 0.0 | NaN | NaN | E | 39.0 | E | ... | 63.0 | 33.0 | 1021.9 | 1017.9 | NaN | NaN | 12.5 | 23.7 | No | Yes |
| 2 | 2010/8/5 | Nuriootpa | -0.4 | 11.0 | 3.6 | 0.4 | 1.6 | W | 28.0 | N | ... | 97.0 | 78.0 | 1025.9 | 1025.3 | 7.0 | 8.0 | 3.9 | 9.0 | Yes | No |
| 3 | 2013/3/18 | Adelaide | 13.2 | 22.6 | 0.0 | 15.4 | 11.0 | SE | 44.0 | E | ... | 47.0 | 34.0 | 1025.0 | 1022.2 | NaN | NaN | 15.2 | 21.7 | No | No |
| 4 | 2011/2/16 | Sale | 14.1 | 28.6 | 0.0 | 6.6 | 6.7 | E | 28.0 | NE | ... | 92.0 | 42.0 | 1018.0 | 1014.1 | 4.0 | 7.0 | 19.1 | 28.2 | No | No |
5 rows × 23 columns
data = data.fillna(-1)
# Fill missing values with -1. fillna is applied to the whole frame, so the
# text (object) columns receive the -1 sentinel too, not only the numeric ones.
data.tail()
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 106639 | 2011/5/23 | Launceston | 10.1 | 16.1 | 15.8 | -1.0 | -1.0 | SE | 31.0 | NNW | ... | 99.0 | 86.0 | 999.2 | 995.2 | -1.0 | -1.0 | 13.0 | 15.6 | Yes | Yes |
| 106640 | 2014/12/9 | GoldCoast | 19.3 | 31.7 | 36.0 | -1.0 | -1.0 | SE | 80.0 | NNW | ... | 75.0 | 76.0 | 1013.8 | 1010.0 | -1.0 | -1.0 | 26.0 | 25.8 | Yes | Yes |
| 106641 | 2014/10/7 | Wollongong | 17.5 | 22.2 | 1.2 | -1.0 | -1.0 | WNW | 65.0 | WNW | ... | 61.0 | 56.0 | 1008.2 | 1008.2 | -1.0 | -1.0 | 17.8 | 21.4 | Yes | No |
| 106642 | 2012/1/16 | Newcastle | 17.6 | 27.0 | 3.0 | -1.0 | -1.0 | -1 | -1.0 | NE | ... | 68.0 | 88.0 | -1.0 | -1.0 | 6.0 | 5.0 | 22.6 | 26.4 | Yes | No |
| 106643 | 2014/10/21 | AliceSprings | 16.3 | 37.9 | 0.0 | 14.2 | 12.2 | ESE | 41.0 | NNE | ... | 8.0 | 6.0 | 1017.9 | 1014.0 | 0.0 | 1.0 | 32.2 | 35.7 | No | No |
5 rows × 23 columns
# Count how many rows carry each value of the RainTomorrow label.
data['RainTomorrow'].value_counts()
No 82786
Yes 23858
Name: RainTomorrow, dtype: int64
# Split the column names by dtype: object columns are treated as categorical
# features, everything else as numeric features.
object_columns = data.select_dtypes(include='object').columns
cate_features = object_columns.tolist()
# RainTomorrow is the label, not an input feature, so drop it from the list.
cate_features.remove('RainTomorrow')
non_object_columns = data.select_dtypes(exclude='object').columns
num_features = non_object_columns.tolist()
## Pairwise scatter plots of three features, coloured by the RainTomorrow label
## (histograms on the diagonal).
pair_columns = ['Rainfall', 'Evaporation', 'Sunshine', 'RainTomorrow']
sns.pairplot(
    data=data[pair_columns],
    diag_kind='hist',
    hue='RainTomorrow',
)
plt.show()
# Author's reading of the plot: of these three variables, Sunshine separates
# the two classes slightly better than the others.

# Draw one box plot per numeric feature, grouped by the RainTomorrow label,
# so the per-class distribution of each feature can be compared.
for col in data[num_features].columns:
    sns.boxplot(x='RainTomorrow', y=col, saturation=0.5, palette='pastel', data=data)
    # Title and show inside the loop so every feature gets its own figure.
    plt.title(col)
    plt.show()
# Author's reading of the box plots: Sunshine, Humidity3pm, Cloud9am and
# Cloud3pm show the clearest difference between the two classes.
















cate_features
['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
# Print the number of distinct values of each categorical feature.
# The [1:] slice skips the first entry of cate_features ('Date' in the
# listing printed above).
for cate in cate_features[1:]:
    print(cate,len(data[cate].unique()))
Location 49
WindGustDir 17
WindDir9am 17
WindDir3pm 17
RainToday 3
# Count rows per (Location, RainTomorrow) pair, then pivot the label values
# into columns so there is one row per location.
grouped_counts = data[['Date', 'Location', 'RainTomorrow']].groupby(['Location', 'RainTomorrow']).count()
loc = grouped_counts.unstack()
# Replace the column index produced by unstack() with plain names; this
# relies on the label columns coming out in the order No, Yes.
loc.columns = ['No', 'Yes']
# Fraction of each location's rows whose RainTomorrow value is Yes.
rainy = loc['Yes']
dry = loc['No']
loc['ratio'] = rainy / (rainy + dry)
loc.sort_values(by='ratio', ascending=False)
# Author's reading of the table: some locations have a clearly higher chance
# of rain, while for others it is very low.
| No | Yes | ratio | |
|---|---|---|---|
| Location | |||
| Portland | 1443 | 818 | 0.361787 |
| Walpole | 1433 | 722 | 0.335035 |
| Cairns | 1555 | 700 | 0.310421 |
| Dartmoor | 1530 | 679 | 0.307379 |
| MountGambier | 1576 | 692 | 0.305115 |
| NorfolkIsland | 1532 | 669 | 0.303953 |
| Witchcliffe | 1544 | 663 | 0.300408 |
| CoffsHarbour | 1589 | 677 | 0.298764 |
| Albany | 1580 | 665 | 0.296214 |
| MountGinini | 1579 | 623 | 0.282925 |
| Williamtown | 1361 | 532 | 0.281035 |
| NorahHead | 1587 | 607 | 0.276664 |
| Darwin | 1739 | 645 | 0.270554 |
| Ballarat | 1678 | 597 | 0.262418 |
| Melbourne |

本文深入探讨了XGBoost的建模流程,包括EDA、数据预处理、模型构建和调参。重点讲解了XGBoost的重要参数如eta、min_child_weight、max_depth等,以及其原理,如泰勒展开优化、CART树构建和并行策略。此外,还阐述了XGBoost如何通过迭代预测误差来串联决策树,以达到更精确的预测结果。
最低0.47元/天 解锁文章

8939

被折叠的 条评论
为什么被折叠?



