集成学习_案例:PUBG绝地求生

项目实现

  • 你必须创建一个模型,根据他们的最终统计数据预测玩家的排名,从1(第一名)到0(最后一名)。
  • 最后结果通过平均绝对误差(MAE)进行评估,即预测的winPlacePerc和真实的winPlacePerc之间的平均绝对误差。
  • 关于MAE: sklearn.metrics.mean_absolute_error
    在这里插入图片描述

获取数据、基本数据信息查看

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
train = pd.read_csv("./data/train_V2.csv")
train.head()
IdgroupIdmatchIdassistsboostsdamageDealtDBNOsheadshotKillshealskillPlace...revivesrideDistanceroadKillsswimDistanceteamKillsvehicleDestroyswalkDistanceweaponsAcquiredwinPointswinPlacePerc
07f96b2f878858a4d4b580de459bea10357fd1a4a91000.0000060...00.000000.0000244.80114660.4444
1eef90569b9d03c684d5656442f9eaeb375fc57110c0091.4700057...00.0045011.04001434.00500.6400
21eaf90ac73de726a4a42c3245a74110163d8bb94ae1068.0000047...00.000000.0000161.80200.7755
34616d365dd2853a930a9c79cd721f1f1f4ef412d7e0032.9000075...00.000000.0000202.70300.1667
4315c96c26c9aacde04010b3458dd6dc8ff871e21e600100.0000045...00.000000.000049.75200.1875

5 rows × 29 columns

train.tail()
IdgroupIdmatchIdassistsboostsdamageDealtDBNOsheadshotKillshealskillPlace...revivesrideDistanceroadKillsswimDistanceteamKillsvehicleDestroyswalkDistanceweaponsAcquiredwinPointswinPlacePerc
4446961afff7f652dbc10d238e426f50de718492834ce5635000.0000074...01292.000.000001019.0315070.1786
4446962f4197cf374e6c0408cdb5c46b2acee854b837376d90144.1500069...00.000.0000081.7600.2935
4446963e1948b1295c88ae26ac84bdf7cef6d0cd12784f1ab0059.0600066...00.002.18400788.7400.4815
4446964cc032cdd73b7acc2223f35411394c9c701d0ad758a04180.4011211...20.000.000002748.0800.8000
44469650d8e7ed728b6fd8c74f72fedf5ff62a16aabcc095c02268.0000118...01369.000.000001244.0500.5464

5 rows × 29 columns

train.describe()
assistsboostsdamageDealtDBNOsheadshotKillshealskillPlacekillPointskillskillStreaks...revivesrideDistanceroadKillsswimDistanceteamKillsvehicleDestroyswalkDistanceweaponsAcquiredwinPointswinPlacePerc
count4.446966e+064.446966e+064.446966e+064.446966e+064.446966e+064.446966e+064.446966e+064.446966e+064.446966e+064.446966e+06...4.446966e+064.446966e+064.446966e+064.446966e+064.446966e+064.446966e+064.446966e+064.446966e+064.446966e+064.446965e+06
mean2.338149e-011.106908e+001.307171e+026.578755e-012.268196e-011.370147e+004.759935e+015.050060e+029.247833e-015.439551e-01...1.646590e-016.061157e+023.496091e-034.509322e+002.386841e-027.918208e-031.154218e+033.660488e+006.064601e+024.728216e-01
std5.885731e-011.715794e+001.707806e+021.145743e+006.021553e-012.679982e+002.746294e+016.275049e+021.558445e+007.109721e-01...4.721671e-011.498344e+037.337297e-023.050220e+011.673935e-019.261157e-021.183497e+032.456544e+007.397004e+023.074050e-01
min0.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+001.000000e+000.000000e+000.000000e+000.000000e+00...0.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00
25%0.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+002.400000e+010.000000e+000.000000e+000.000000e+00...0.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+001.551000e+022.000000e+000.000000e+002.000000e-01
50%0.000000e+000.000000e+008.424000e+010.000000e+000.000000e+000.000000e+004.700000e+010.000000e+000.000000e+000.000000e+00...0.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+006.856000e+023.000000e+000.000000e+004.583000e-01
75%0.000000e+002.000000e+001.860000e+021.000000e+000.000000e+002.000000e+007.100000e+011.172000e+031.000000e+001.000000e+00...0.000000e+001.909750e-010.000000e+000.000000e+000.000000e+000.000000e+001.976000e+035.000000e+001.495000e+037.407000e-01
max2.200000e+013.300000e+016.616000e+035.300000e+016.400000e+018.000000e+011.010000e+022.170000e+037.200000e+012.000000e+01...3.900000e+014.071000e+041.800000e+013.823000e+031.200000e+015.000000e+002.578000e+042.360000e+022.013000e+031.000000e+00

8 rows × 25 columns

train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 object
groupId            object
matchId            object
assists            int64
boosts             int64
damageDealt        float64
DBNOs              int64
headshotKills      int64
heals              int64
killPlace          int64
killPoints         int64
kills              int64
killStreaks        int64
longestKill        float64
matchDuration      int64
matchType          object
maxPlace           int64
numGroups          int64
rankPoints         int64
revives            int64
rideDistance       float64
roadKills          int64
swimDistance       float64
teamKills          int64
vehicleDestroys    int64
walkDistance       float64
weaponsAcquired    int64
winPoints          int64
winPlacePerc       float64
dtypes: float64(6), int64(19), object(4)
memory usage: 983.9+ MB
# 查看一共有多少条数据
train.shape

(4446966, 29)

# 有多少场比赛
np.unique(train["matchId"]).shape

(47965,)

# 有多少支队伍
np.unique(train["groupId"]).shape

(2026745,)

数据基本处理

数据缺失值处理

# Report, per column, whether it contains any missing value; only winPlacePerc
# does. DataFrame.any() reduces column-wise on every pandas version, whereas
# np.any(DataFrame) may collapse the whole frame to a single boolean depending
# on the installed numpy/pandas combination.
train.isnull().any()
Id                 False
groupId            False
matchId            False
assists            False
boosts             False
damageDealt        False
DBNOs              False
headshotKills      False
heals              False
killPlace          False
killPoints         False
kills              False
killStreaks        False
longestKill        False
matchDuration      False
matchType          False
maxPlace           False
numGroups          False
rankPoints         False
revives            False
rideDistance       False
roadKills          False
swimDistance       False
teamKills          False
vehicleDestroys    False
walkDistance       False
weaponsAcquired    False
winPoints          False
winPlacePerc        True
dtype: bool
# 查找缺失值
train[train["winPlacePerc"].isnull()]
IdgroupIdmatchIdassistsboostsdamageDealtDBNOsheadshotKillshealskillPlace...revivesrideDistanceroadKillsswimDistanceteamKillsvehicleDestroyswalkDistanceweaponsAcquiredwinPointswinPlacePerc
2744604f70c74418bb06412dfbede33f92b224a123c53e008000.00001...00.000.0000.000NaN

1 rows × 29 columns

# Drop every row whose target winPlacePerc is missing instead of relying on the
# hard-coded row label 2744604 returned by the lookup above; on this data set
# that is the same single row.
train = train.dropna(subset=["winPlacePerc"])
train.shape

(4446965, 29)

特征数据规范化处理

查看每场比赛参加的人数
count = train.groupby("matchId")["matchId"].transform("count")
train["playersJoined"] = count
train.head()
IdgroupIdmatchIdassistsboostsdamageDealtDBNOsheadshotKillshealskillPlace...rideDistanceroadKillsswimDistanceteamKillsvehicleDestroyswalkDistanceweaponsAcquiredwinPointswinPlacePercplayersJoined
07f96b2f878858a4d4b580de459bea10357fd1a4a91000.0000060...0.000000.0000244.80114660.444496
1eef90569b9d03c684d5656442f9eaeb375fc57110c0091.4700057...0.0045011.04001434.00500.640091
21eaf90ac73de726a4a42c3245a74110163d8bb94ae1068.0000047...0.000000.0000161.80200.775598
34616d365dd2853a930a9c79cd721f1f1f4ef412d7e0032.9000075...0.000000.0000202.70300.166791
4315c96c26c9aacde04010b3458dd6dc8ff871e21e600100.0000045...0.000000.000049.75200.187597

5 rows × 30 columns

train["playersJoined"].sort_values().head()
1206365    2
2109739    2
3956552    5
3620228    5
696000     5
Name: playersJoined, dtype: int64
# Bar chart of how many rows belong to matches of each size.
plt.figure(figsize=(20, 8))
# Pass the series as the keyword argument x: current seaborn treats the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=train["playersJoined"])
plt.grid()
plt.show()

在这里插入图片描述

# Same count plot, restricted to rows from matches with at least 75 players.
plt.figure(figsize=(20, 8))
# Pass the filtered series as the keyword argument x: current seaborn treats
# the first positional argument as `data`, not as the variable to count.
sns.countplot(x=train.loc[train["playersJoined"] >= 75, "playersJoined"])
plt.grid()
plt.show()

在这里插入图片描述

规范化输出部分数据
# Scaling factor per row: 1 for a 100-player match, growing linearly as the
# number of players in the match drops.
norm_factor = (100 - train["playersJoined"]) / 100 + 1

# Add a "<column>Norm" copy of each match-size-dependent statistic.
for col in ("kills", "damageDealt", "maxPlace", "matchDuration"):
    train[col + "Norm"] = train[col] * norm_factor
train.head()
IdgroupIdmatchIdassistsboostsdamageDealtDBNOsheadshotKillshealskillPlace...vehicleDestroyswalkDistanceweaponsAcquiredwinPointswinPlacePercplayersJoinedkillsNormdamageDealtNormmaxPlaceNormmatchDurationNorm
07f96b2f878858a4d4b580de459bea10357fd1a4a91000.0000060...0244.80114660.4444960.000.000029.121358.24
1eef90569b9d03c684d5656442f9eaeb375fc57110c0091.4700057...01434.00500.6400910.0099.702328.341936.93
21eaf90ac73de726a4a42c3245a74110163d8bb94ae1068.0000047...0161.80200.7755980.0069.360051.001344.36
34616d365dd2853a930a9c79cd721f1f1f4ef412d7e0032.9000075...0202.70300.1667910.0035.861033.791565.24
4315c96c26c9aacde04010b3458dd6dc8ff871e21e600100.0000045...049.75200.1875971.03103.000099.911466.72

5 rows × 34 columns

# 比较经过规范化的特征值和原始特征值的值
to_show = ['Id', 'kills','killsNorm','damageDealt', 'damageDealtNorm', 'maxPlace', 'maxPlaceNorm', 'matchDuration', 'matchDurationNorm']
train[to_show][0:11]
IdkillskillsNormdamageDealtdamageDealtNormmaxPlacemaxPlaceNormmatchDurationmatchDurationNorm
07f96b2f878858a00.000.0000.000002829.1213061358.24
1eef90569b9d03c00.0091.47099.702302628.3417771936.93
21eaf90ac73de7200.0068.00069.360005051.0013181344.36
34616d365dd285300.0032.90035.861003133.7914361565.24
4315c96c26c9aac11.03100.000103.000009799.9114241466.72
5ff79c12f32650611.05100.000105.000002829.4013951464.75
695959be0e21ca300.000.0000.000002828.8413161355.48
7311b84c6ff439000.008.5388.879529699.8419672045.68
81a68204ccf989100.0051.60053.148002828.8413751416.25
9e5bb5a4358725300.0037.27038.388102929.8719301987.90
102b574d4397281300.0028.38028.663802929.2918111829.11

部分变量合成

train["healsandboosts"] = train["heals"] + train["boosts"]
train[["heals", "boosts", "healsandboosts"]].tail(10)
healsboostshealsandboosts
4446956101
4446957011
4446958000
4446959000
4446960000
4446961000
4446962011
4446963000
4446964246
4446965123

异常值处理

异常值处理:删除有击杀,但是完全没有移动的玩家
train["totalDistance"] = train["rideDistance"] + train["walkDistance"] + train["swimDistance"]
train.head()
IdgroupIdmatchIdassistsboostsdamageDealtDBNOsheadshotKillshealskillPlace...winPointswinPlacePercplayersJoinedkillsNormdamageDealtNormmaxPlaceNormmatchDurationNormhealsandbooststotalDistancekillwithoutMoving
07f96b2f878858a4d4b580de459bea10357fd1a4a91000.0000060...14660.4444960.000.000029.121358.240244.8000False
1eef90569b9d03c684d5656442f9eaeb375fc57110c0091.4700057...00.6400910.0099.702328.341936.9301445.0445False
21eaf90ac73de726a4a42c3245a74110163d8bb94ae1068.0000047...00.7755980.0069.360051.001344.360161.8000False
34616d365dd2853a930a9c79cd721f1f1f4ef412d7e0032.9000075...00.1667910.0035.861033.791565.240202.7000False
4315c96c26c9aacde04010b3458dd6dc8ff871e21e600100.0000045...00.1875971.03103.000099.911466.72049.7500False

5 rows × 37 columns

# (train["kills"] > 0) & (train["totalDistance"] == 0)
train["killwithoutMoving"] = (train["kills"] > 0) & (train["totalDistance"] == 0)
train[train["killwithoutMoving"] == True].head()
IdgroupIdmatchIdassistsboostsdamageDealtDBNOsheadshotKillshealskillPlace...winPointswinPlacePercplayersJoinedkillsNormdamageDealtNormmaxPlaceNormmatchDurationNormhealsandbooststotalDistancekillwithoutMoving
1824b538d514ef24760eb2ce2f43f9d635e7d750e442e200593.000318...00.8571588.52842.06021.30842.0630.0True
66736d3a61da07b7cb2d8119b1544f87904cecf36217df20346.600633...00.6000424.74547.62817.382834.5260.0True
11892550398a8f33db7c3fd0e2abab0afdb6f6d1f0d4904201750.00453...00.89472135.803132.50035.801607.4250.0True
1463158d690ee461e9dea5b6630b33d67dbf34301df5e5300157.800069...15000.0000731.27200.40624.131014.7300.0True
1559149b61fc963d6320f5c5f19d9cc21904cecf36217df00100.001037...00.3000421.58158.00017.382834.5200.0True

5 rows × 37 columns

train[train["killwithoutMoving"] == True].shape

(1535, 37)

train[train["killwithoutMoving"] == True].index
Int64Index([   1824,    6673,   11892,   14631,   15591,   20881,   23298,
              24640,   25659,   30079,
            ...
            4426500, 4429697, 4432954, 4436511, 4437516, 4440232, 4440898,
            4440927, 4441511, 4446682],
           dtype='int64', length=1535)
train.drop(train[train["killwithoutMoving"] == True].index, inplace=True)
train.shape

(4445430, 37)

异常值处理:删除驾车杀敌数异常的数据
# train["roadKills"] > 10
train.drop(train[train["roadKills"] > 10].index, inplace=True)
train.shape

(4445426, 37)

异常值处理:删除玩家在一局中杀敌数超过30人的数据
train[train["kills"] > 30].head()
IdgroupIdmatchIdassistsboostsdamageDealtDBNOsheadshotKillshealskillPlace...winPointswinPlacePercplayersJoinedkillsNormdamageDealtNormmaxPlaceNormmatchDurationNormhealsandbooststotalDistancekillwithoutMoving
579789d8253e21ccbbdef7135ed856cd837f05e2a01015f903725.00702...15000.85711664.406854.0014.723308.32048.82False
8779345f76442384931b3627758941d3437f05e2a01015f803087.008273...15001.00001657.045680.0814.723308.3227780.70False
156599746aa7eabf7c865723e7d8250da3f900de1ec39fa52105479.001274...00.70001190.7210355.3120.793398.22723.71False
16025415622257cb44e21a513eeecfe724db413c7c48292c104033.004001...15001.00006257.965565.5411.041164.720718.30False
1801891355613d43e2d0f863cd38c61dbf39c442628f5df5503171.006151...01.00001166.155993.1917.013394.441571.51False

5 rows × 37 columns

train.drop(train[train["kills"] > 30].index, inplace=True)
train.shape

(4445331, 37)

异常值处理:删除爆头率异常数据
train["headshot_rate"] = train["headshotKills"]/train["kills"]
train["headshot_rate"].head()
0    NaN
1    NaN
2    NaN
3    NaN
4    0.0
Name: headshot_rate, dtype: float64
train["headshot_rate"] = train["headshot_rate"].fillna(0)
train.head()
IdgroupIdmatchIdassistsboostsdamageDealtDBNOsheadshotKillshealskillPlace...winPlacePercplayersJoinedkillsNormdamageDealtNormmaxPlaceNormmatchDurationNormhealsandbooststotalDistancekillwithoutMovingheadshot_rate
07f96b2f878858a4d4b580de459bea10357fd1a4a91000.0000060...0.4444960.000.000029.121358.240244.8000False0.0
1eef90569b9d03c684d5656442f9eaeb375fc57110c0091.4700057...0.6400910.0099.702328.341936.9301445.0445False0.0
21eaf90ac73de726a4a42c3245a74110163d8bb94ae1068.0000047...0.7755980.0069.360051.001344.360161.8000False0.0
34616d365dd2853a930a9c79cd721f1f1f4ef412d7e0032.9000075...0.1667910.0035.861033.791565.240202.7000False0.0
4315c96c26c9aacde04010b3458dd6dc8ff871e21e600100.0000045...0.1875971.03103.000099.911466.72049.7500False0.0

5 rows × 38 columns

train[(train["headshot_rate"] == 1) & (train["kills"] > 9)].head()
IdgroupIdmatchIdassistsboostsdamageDealtDBNOsheadshotKillshealskillPlace...winPlacePercplayersJoinedkillsNormdamageDealtNormmaxPlaceNormmatchDurationNormhealsandbooststotalDistancekillwithoutMovingheadshot_rate
281570ab9d7168570927add05ebde0214ce016a873339c7b231212.081001...0.84629310.701296.8428.891522.6132939.0False1.0
346124044d18fc42fc75fc1dbc2df6a887628107d4c41084351620.0131131...1.00009611.441684.8028.081796.0888142.0False1.0
871244e668a25f5488e35ba8feabfb2a23f6e6581e03ba4f041365.091301...1.00009813.261392.3027.541280.1042105.0False1.0
908815566d8218b705aaa9b056478d71b23a41552d553583251535.0101031...0.96309510.501611.7529.401929.9087948.0False1.0
9634631bd6fd288df4f090584ffa22fe15ba2de992ec7bb8261355.0121021...1.00009610.401409.2028.081473.6883476.0False1.0

5 rows × 38 columns

train[(train["headshot_rate"] == 1) & (train["kills"] > 9)].index
Int64Index([ 281570,  346124,  871244,  908815,  963463, 1079403, 1167959,
            1348164, 1380385, 1483199, 1581850, 1622232, 1753322, 2256755,
            2375749, 2647056, 2825200, 3288424, 3594399, 3926325, 4036281,
            4351048, 4387092, 4428741],
           dtype='int64')
train.drop(train[(train["headshot_rate"] == 1) & (train["kills"] > 9)].index, inplace=True)
train.shape

(4445307, 38)

异常值处理:删除最远杀敌距离异常数据
train[train["longestKill"] >=1000]
IdgroupIdmatchIdassistsboostsdamageDealtDBNOsheadshotKillshealskillPlace...winPlacePercplayersJoinedkillsNormdamageDealtNormmaxPlaceNormmatchDurationNormhealsandbooststotalDistancekillwithoutMovingheadshot_rate
20228188e2af7d78af5a34ddeede52c0424346bc63bc67fa03783.95115...0.9231884.48877.96830.242087.6843775.20False0.250000
24000541c2f5c06998079faecf87ab4275634edab75860b3501284.085718...0.53852918.812195.64023.942236.68748.87False0.454545
324313ef390c152bcc3d30fd444be3bbc14f7f8d6cf558b4201028.00009...1.00005114.901531.72019.371040.0202981.00False0.000000
6565539948b058562163c8cb8491112bf60104eeb664494d601410.017503...0.60004125.442241.9009.541734.69029.21False0.312500
8036324e7e6c74e3c57d94698690918933da91b0c3d875f800196.800051...0.0000611.39273.55211.12654.6903159.00False0.000000
8954111f5ba6e0cfb968512ea24b831be35fb0d8b1fc16cf401012.011505...0.90918611.401153.68013.681163.940569.50False0.500000
1172437303a93cfa1f46c8795d39fd0df869c8962b58bb3e321329.300245...0.2857584.26467.60611.36825.023832.50False0.000000
1209416528659ff1c1aec7d1ba83423551dea9386587d5888061640.00701...0.94125222.202427.20076.961827.8062848.00False0.466667
164271291966848e08e2f0ee4fbd27657c917dea22cefe62a322103.0041111...0.50002839.563617.16025.803092.5613235.30False0.173913
20155595ff0c1a9fab2ba2d8119b1544f87904cecf36217df331302.006515...0.60004217.382057.16017.382834.528133.20False0.545455
212212842df3102cb540b7d9b2be15b355b610d78f3affd2e502500.00712...0.00001041.804750.0003.803416.201464.50False0.318182
21524254b9f61bac5eb0abc717b964f3bbe838cb9a3c9459830945.400011...0.57146018.201323.56011.201673.000844.70False0.000000
259271824e0fec84c18e98404855ca02e48e886a8ebb702cf701684.004711...0.57142622.622930.16038.283118.0874851.00False0.307692
29817157f77051c7cef52d6579a630399b54784f7d9a06b51351025.05252...1.0000936.421096.75050.291453.06104085.96False0.333333
3081503f19a76e8d7ac52624d65c529f87cde19b70121c40f301038.000032...0.8571578.581484.34011.44945.230270.00False0.000000
32551715524c154448425674195558ad41bdb6f6d1f0d4904101355.00209...0.57892125.062425.45035.801607.4201039.00False0.142857
3304284d0c286ce498e1717fdd45e612bab3eaaa2f7a360fe702330.00202...1.00005329.403425.10026.461321.53068.02False0.100000
33209600040e53dfe7b5d650661c2351eb72daabf3a7852e600399.020614...0.0000157.40738.15014.802763.9065481.00False0.000000
3552532db638834c62f6f0614b611d6a935ff80300f8262f520517.000010...0.0000308.50878.9006.80612.0001344.88False0.000000
4332473d8857d3d7e31b6085de7a36897e642f997c16d8a0e501685.0113188...0.90912627.842931.90020.883119.8218523.30False0.187500

20 rows × 38 columns

train[train["longestKill"] >=1000].index
Int64Index([ 202281,  240005,  324313,  656553,  803632,  895411, 1172437,
            1209416, 1642712, 2015559, 2122128, 2152425, 2592718, 2981715,
            3081503, 3255171, 3304284, 3320960, 3552532, 4332473],
           dtype='int64')
train.drop(train[train["longestKill"] >=1000].index, inplace=True)
train.shape

(4445287, 38)

异常值处理:删除关于运动距离的异常值
# 行走
train[train["walkDistance"] >=10000].index
Int64Index([  23026,   34344,   49312,   68590,   94400,  125103,  136421,
             136476,  154080,  154128,
            ...
            4181311, 4230073, 4259976, 4284974, 4288445, 4306598, 4370543,
            4380785, 4405009, 4415088],
           dtype='int64', length=219)
train.drop(train[train["walkDistance"] >=10000].index, inplace=True)
train.shape

(4445068, 38)

# 载具
train[train["rideDistance"] >=20000].index
Int64Index([  28588,   63015,   70507,   72763,   95276,  140097,  297186,
             371098,  403647,  426708,
            ...
            4154459, 4191491, 4239725, 4248221, 4256764, 4270943, 4301013,
            4386384, 4404738, 4440261],
           dtype='int64', length=150)
train.drop(train[train["rideDistance"] >=20000].index, inplace=True)
train.shape

(4444918, 38)

# 游泳
train[train["swimDistance"] >=2000].index
Int64Index([ 177973,  274258, 1005337, 1195818, 1227362, 1889163, 2065940,
            2327586, 2784855, 3359439, 3513522, 4132225],
           dtype='int64')
# Drop the swim-distance outliers selected just above. The threshold must be
# 2000 to match that selection; the earlier value of 20000 matched no rows, so
# the shapes recorded below were produced without this filter taking effect.
train.drop(train[train["swimDistance"] >= 2000].index, inplace=True)
train.shape

(4444918, 38)

异常值处理:武器收集异常值处理
train[train["weaponsAcquired"] >=80].index
Int64Index([ 233643,  588387, 1437471, 1449293, 1592744, 1834515, 2373240,
            2442962, 2743408, 2749693, 2797867, 2973445, 2977084, 2982525,
            3230315, 3405716, 3951710, 4022031, 4288517],
           dtype='int64')
train.drop(train[train["weaponsAcquired"] >=80].index, inplace=True)
train.shape

(4444899, 38)

异常值处理:删除使用治疗药品数量异常值
train[train["heals"] >=80].index

Int64Index([4262662], dtype='int64')

train.drop(train[train["heals"] >=80].index, inplace=True)
train.shape

(4444898, 38)

类别型数据处理

比赛类型one-hot处理
train["matchType"].unique()
array(['squad-fpp', 'duo', 'solo-fpp', 'squad', 'duo-fpp', 'solo',
       'normal-squad-fpp', 'crashfpp', 'flaretpp', 'normal-solo-fpp',
       'flarefpp', 'normal-duo-fpp', 'normal-duo', 'normal-squad',
       'crashtpp', 'normal-solo'], dtype=object)
train = pd.get_dummies(train, columns=["matchType"])
train.head()
IdgroupIdmatchIdassistsboostsdamageDealtDBNOsheadshotKillshealskillPlace...matchType_normal-duomatchType_normal-duo-fppmatchType_normal-solomatchType_normal-solo-fppmatchType_normal-squadmatchType_normal-squad-fppmatchType_solomatchType_solo-fppmatchType_squadmatchType_squad-fpp
07f96b2f878858a4d4b580de459bea10357fd1a4a91000.0000060...0000000001
1eef90569b9d03c684d5656442f9eaeb375fc57110c0091.4700057...0000000001
21eaf90ac73de726a4a42c3245a74110163d8bb94ae1068.0000047...0000000000
34616d365dd2853a930a9c79cd721f1f1f4ef412d7e0032.9000075...0000000001
4315c96c26c9aacde04010b3458dd6dc8ff871e21e600100.0000045...0000000100

5 rows × 53 columns

matchType_encoding = train.filter(regex="matchType")
matchType_encoding.head()
matchType_crashfppmatchType_crashtppmatchType_duomatchType_duo-fppmatchType_flarefppmatchType_flaretppmatchType_normal-duomatchType_normal-duo-fppmatchType_normal-solomatchType_normal-solo-fppmatchType_normal-squadmatchType_normal-squad-fppmatchType_solomatchType_solo-fppmatchType_squadmatchType_squad-fpp
00000000000000001
10000000000000001
20010000000000000
30000000000000001
40000000000000100
对groupId,matchId等数据进行处理
train["groupId"].head()
0    4d4b580de459be
1    684d5656442f9e
2    6a4a42c3245a74
3    a930a9c79cd721
4    de04010b3458dd
Name: groupId, dtype: object
# train["groupId"].astype("category")
train["groupId"] = train["groupId"].astype("category")
train["groupId_cat"] = train["groupId"].cat.codes
train["groupId_cat"].head()
0     613619
1     827616
2     843307
3    1340122
4    1757411
Name: groupId_cat, dtype: int32
train["matchId"] = train["matchId"].astype("category")
train["matchId_cat"] = train["matchId"].cat.codes
train["matchId_cat"].head()
0    30085
1    32751
2     3143
3    45260
4    20531
Name: matchId_cat, dtype: int32
train.head()
IdgroupIdmatchIdassistsboostsdamageDealtDBNOsheadshotKillshealskillPlace...matchType_normal-solomatchType_normal-solo-fppmatchType_normal-squadmatchType_normal-squad-fppmatchType_solomatchType_solo-fppmatchType_squadmatchType_squad-fppgroupId_catmatchId_cat
07f96b2f878858a4d4b580de459bea10357fd1a4a91000.0000060...0000000161361930085
1eef90569b9d03c684d5656442f9eaeb375fc57110c0091.4700057...0000000182761632751
21eaf90ac73de726a4a42c3245a74110163d8bb94ae1068.0000047...000000008433073143
34616d365dd2853a930a9c79cd721f1f1f4ef412d7e0032.9000075...00000001134012245260
4315c96c26c9aacde04010b3458dd6dc8ff871e21e600100.0000045...00000100175741120531

5 rows × 55 columns

train.drop(["groupId", "matchId"], axis=1, inplace=True)

数据截取

取部分数据进行使用(100000)
df_sample = train.sample(100000)
df_sample.shape
(100000, 53)

确定特征值和目标值

df = df_sample.drop(["winPlacePerc", "Id"], axis=1)

y = df_sample["winPlacePerc"]                
df.shape

(100000, 51)

y.shape

(100000,)

分割训练集和测试集

from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(df, y, test_size=0.2)
X_train.shape

(80000, 51)

y_train.shape

(80000,)

机器学习(模型训练)和评估

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

使用随机森林对模型进行训练

初步使用随机森林进行模型训练
m1 = RandomForestRegressor(n_estimators=40, 
                           min_samples_leaf=3, 
                           max_features='sqrt',
                           n_jobs=-1)

m1.fit(X_train, y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=40, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
y_pre = m1.predict(X_valid)
m1.score(X_valid, y_valid)

0.907159951456783

mean_absolute_error(y_valid, y_pre)

0.06647584387091089

再次使用随机森林,进行模型训练
m1.feature_importances_
array([1.87658119e-03, 8.63746064e-02, 2.57685962e-02, 2.27532062e-03,
       8.80290888e-04, 2.81013428e-02, 2.35573150e-01, 2.07083462e-03,
       1.22714874e-02, 1.09702183e-02, 2.53353780e-02, 1.03767737e-02,
       6.77020063e-03, 7.45172312e-03, 4.28638708e-03, 3.28563034e-03,
       2.12925123e-02, 1.99967979e-05, 3.98400033e-03, 1.36372746e-04,
       1.32592980e-04, 1.71940999e-01, 4.20790087e-02, 2.52040557e-03,
       6.33213818e-03, 7.30402941e-03, 1.11477263e-02, 7.52171932e-03,
       1.19465432e-02, 5.30382552e-02, 1.81348675e-01, 0.00000000e+00,
       2.22747761e-03, 3.44612223e-05, 0.00000000e+00, 2.02507233e-04,
       5.97533128e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       3.54286394e-05, 0.00000000e+00, 4.94182811e-07, 0.00000000e+00,
       3.11300574e-04, 1.93840925e-04, 9.02659739e-04, 1.04955041e-03,
       9.87217660e-04, 4.53628891e-03, 4.50774311e-03])
imp_df = pd.DataFrame({"cols":df.columns, "imp":m1.feature_importances_})
imp_df.head()
colsimp
0assists0.001877
1boosts0.086375
2damageDealt0.025769
3DBNOs0.002275
4headshotKills0.000880
imp_df = imp_df.sort_values("imp", ascending=False)
imp_df.head()
colsimp
6killPlace0.235573
30totalDistance0.181349
21walkDistance0.171941
1boosts0.086375
29healsandboosts0.053038
imp_df[:20].plot("cols", "imp", figsize=(20, 8), kind="barh")

在这里插入图片描述

to_keep = imp_df[imp_df.imp > 0.005].cols
to_keep.shape

(20,)

df_keep = df[to_keep]
X_train, X_valid, y_train, y_valid = train_test_split(df_keep, y, test_size=0.2)
X_train.shape

(80000, 20)

m2 = RandomForestRegressor(n_estimators=40, 
                           min_samples_leaf=3, 
                           max_features='sqrt',
                           n_jobs=-1)

m2.fit(X_train, y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=40, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
y_pre = m2.predict(X_valid)
m2.score(X_valid, y_valid)

0.9125654968172906

mean_absolute_error(y_valid, y_pre)

0.06408683094647326

使用lightGBM对模型进行训练

X_train, X_valid, y_train, y_valid = train_test_split(df, y, test_size=0.2)
X_train.shape

(80000, 51)

模型初次尝试
import lightgbm as lgb
gbm = lgb.LGBMRegressor(objective="regression", num_leaves=31, learning_rate=0.05, n_estimators=20)

gbm.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric="l1", early_stopping_rounds=5)
[1]	valid_0's l1: 0.255801	valid_0's l2: 0.0863836
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l1: 0.244604	valid_0's l2: 0.0792314
[3]	valid_0's l1: 0.234038	valid_0's l2: 0.072761
[4]	valid_0's l1: 0.224123	valid_0's l2: 0.0669453
[5]	valid_0's l1: 0.214716	valid_0's l2: 0.0616647
[6]	valid_0's l1: 0.205802	valid_0's l2: 0.0568409
[7]	valid_0's l1: 0.197424	valid_0's l2: 0.0525102
[8]	valid_0's l1: 0.189497	valid_0's l2: 0.0485595
[9]	valid_0's l1: 0.18208	valid_0's l2: 0.0450087
[10]	valid_0's l1: 0.175038	valid_0's l2: 0.0417809
[11]	valid_0's l1: 0.168411	valid_0's l2: 0.0388494
[12]	valid_0's l1: 0.162014	valid_0's l2: 0.0361473
[13]	valid_0's l1: 0.156139	valid_0's l2: 0.0337388
[14]	valid_0's l1: 0.150548	valid_0's l2: 0.031546
[15]	valid_0's l1: 0.145259	valid_0's l2: 0.0295381
[16]	valid_0's l1: 0.140261	valid_0's l2: 0.0277049
[17]	valid_0's l1: 0.135596	valid_0's l2: 0.0260668
[18]	valid_0's l1: 0.131269	valid_0's l2: 0.0245903
[19]	valid_0's l1: 0.127159	valid_0's l2: 0.0232428
[20]	valid_0's l1: 0.123315	valid_0's l2: 0.0220185
Did not meet early stopping. Best iteration is:
[20]	valid_0's l1: 0.123315	valid_0's l2: 0.0220185



LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=20, n_jobs=-1, num_leaves=31, objective='regression',
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
y_pre = gbm.predict(X_valid, num_iteration=gbm.best_iteration_)
mean_absolute_error(y_valid, y_pre)

0.12331524150224461

模型二次调优
from sklearn.model_selection import GridSearchCV

# Grid-search learning_rate and n_estimators for LightGBM with 5-fold
# cross-validation on the training split.
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
    "learning_rate": [0.01, 0.1, 1],
    "n_estimators": [40, 60, 80, 100, 200, 300],
}

# Rank candidates by MAE, the metric this project is evaluated on. Without
# `scoring`, GridSearchCV falls back to the estimator's own score method
# (R^2 for regressors). The repr and results recorded below were produced
# with that default.
gbm = GridSearchCV(
    estimator,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
)

gbm.fit(X_train, y_train)
GridSearchCV(cv=5, error_score=nan,
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective=None, random_state=None,
                                     reg_alpha=0.0, reg_lambda=0.0, silent=True,
                                     subsample=1.0, subsample_for_bin=200000,
                                     subsample_freq=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.1, 1],
                         'n_estimators': [40, 60, 80, 100, 200, 300]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
y_pre = gbm.predict(X_valid)
mean_absolute_error(y_valid, y_pre)

0.05685004010605751

gbm.best_params_

{'learning_rate': 0.1, 'n_estimators': 300}

模型三次调优
# Sweep n_estimators with every other LightGBM setting held fixed and record
# the validation MAE of each candidate.

scores = []
n_estimators = [100, 300, 500, 800]

fixed_params = dict(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=5,
    learning_rate=0.1,
    min_child_samples=20,
    n_jobs=-1,
)

for n_trees in n_estimators:
    lgbm = lgb.LGBMRegressor(n_estimators=n_trees, **fixed_params)

    # Stop early once the validation scores have not improved for 5 rounds.
    lgbm.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="l1",
        early_stopping_rounds=5,
    )

    y_pre = lgbm.predict(X_valid)
    mae = mean_absolute_error(y_valid, y_pre)

    scores.append(mae)
    print("本次结果输出的mae值是:\n", mae)

本次结果输出的mae值是:
0.0566209902947507

plt.plot(n_estimators,scores,'o-')
plt.ylabel("mae")
plt.xlabel("n_estimator")
print("best n_estimator {}".format(n_estimators[np.argmin(scores)]))

best n_estimator 500

在这里插入图片描述

# Sweep max_depth with n_estimators fixed at 500 and every other LightGBM
# setting unchanged, recording the validation MAE of each candidate.

scores = []
max_depth = [3, 5, 7, 9, 11]

fixed_params = dict(
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.1,
    n_estimators=500,
    min_child_samples=20,
    n_jobs=-1,
)

for depth in max_depth:
    lgbm = lgb.LGBMRegressor(max_depth=depth, **fixed_params)

    # Stop early once the validation scores have not improved for 5 rounds.
    lgbm.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="l1",
        early_stopping_rounds=5,
    )

    y_pre = lgbm.predict(X_valid)
    mae = mean_absolute_error(y_valid, y_pre)

    scores.append(mae)
    print("本次结果输出的mae值是:\n", mae)

本次结果输出的mae值是:
0.0571923061736829

plt.plot(max_depth,scores,'o-')
plt.ylabel("mae")
plt.xlabel("max_depths")
print("best max_depths {}".format(max_depth[np.argmin(scores)]))

best max_depths 5

在这里插入图片描述

scores

[0.058867698663447106,
0.0566209902947507,
0.05695850296967709,
0.057414793402343275,
0.0571923061736829]

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

¥骁勇善战¥

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值