import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
train = pd.read_excel(r"C:\Users\Administrator\Desktop\jimsir\天池大赛\manufacture\data_first\train.xlsx")
# Data overview
train.head()
 | ID | TOOL_ID | 210X1 | 210X2 | 210X3 | 210X4 | 210X5 | 210X6 | 210X7 | 210X8 | ... | 750X1444 | 750X1445 | 750X1446 | 750X1447 | 750X1448 | 750X1449 | 750X1450 | 750X1451 | 750X1452 | Y |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ID001 | N | 102.05 | 0.465 | 0.27 | 1.430 | 67.45 | 4.620 | -0.54 | -1.05 | ... | 0.00072 | 0.00072 | 25.7 | 0.00072 | 0.00072 | 25.7 | 0.00072 | 0.00072 | 2.400000e+12 | 2.945079 |
1 | ID002 | M | 100.95 | 0.805 | 0.22 | 3.477 | 62.08 | 3.412 | -2.12 | 1.02 | ... | 0.00072 | 0.00072 | 25.5 | 0.00072 | 0.00072 | 25.5 | 0.00072 | 0.00072 | 2.400000e+12 | 2.955092 |
2 | ID003 | L | 98.56 | 0.555 | 0.24 | 1.172 | 56.70 | 3.080 | -2.25 | 0.88 | ... | 0.00064 | 0.00064 | 25.2 | 0.00064 | 0.00064 | 25.2 | 0.00064 | 0.00064 | 2.400000e+12 | 2.741264 |
3 | ID004 | M | 100.35 | 0.901 | 0.22 | 3.631 | 62.25 | 3.949 | -1.98 | 0.82 | ... | 0.00072 | 0.00072 | 26.4 | 0.00072 | 0.00072 | 26.4 | 0.00072 | 0.00072 | 2.400000e+12 | 2.799336 |
4 | ID005 | M | 100.25 | 0.854 | 0.23 | 3.429 | 61.42 | 3.630 | -1.89 | 1.02 | ... | 0.00072 | 0.00072 | 26.4 | 0.00072 | 0.00072 | 26.4 | 0.00072 | 0.00072 | 2.400000e+12 | 2.692093 |
5 rows × 8029 columns
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Columns: 8029 entries, ID to Y
dtypes: float64(6279), int64(1739), object(11)
memory usage: 30.6+ MB
train.describe() # the count row shows how many non-null values each column has
 | 210X1 | 210X2 | 210X3 | 210X4 | 210X5 | 210X6 | 210X7 | 210X8 | 210X9 | 210X10 | ... | 750X1444 | 750X1445 | 750X1446 | 750X1447 | 750X1448 | 750X1449 | 750X1450 | 750X1451 | 750X1452 | Y |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 | ... | 486.000000 | 486.000000 | 486.000000 | 486.000000 | 486.000000 | 486.000000 | 486.000000 | 486.000000 | 4.860000e+02 | 500.000000 |
mean | 92.272420 | 0.511932 | 0.232420 | 1.626228 | 57.126160 | 3.424444 | -0.428580 | 0.267060 | -0.027540 | 25.854000 | ... | 0.000694 | 0.000694 | 26.010700 | 0.000694 | 0.000694 | 26.010700 | 0.000694 | 0.000694 | 2.400000e+12 | 2.846187 |
std | 27.471434 | 0.233265 | 0.104536 | 1.090267 | 18.224157 | 1.148460 | 1.188766 | 1.092823 | 0.120268 | 1.908069 | ... | 0.000045 | 0.000045 | 0.349445 | 0.000045 | 0.000045 | 0.349445 | 0.000045 | 0.000045 | 0.000000e+00 | 0.200970 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -2.450000 | -1.920000 | -0.220000 | 0.000000 | ... | 0.000540 | 0.000540 | 24.900000 | 0.000540 | 0.000540 | 24.900000 | 0.000540 | 0.000540 | 2.400000e+12 | 2.326846 |
25% | 100.100000 | 0.365750 | 0.210000 | 1.019750 | 56.575000 | 3.242000 | -1.912500 | -0.670000 | -0.140000 | 24.400000 | ... | 0.000670 | 0.000670 | 25.800000 | 0.000670 | 0.000670 | 25.800000 | 0.000670 | 0.000670 | 2.400000e+12 | 2.705686 |
50% | 101.300000 | 0.506000 | 0.240000 | 1.293000 | 62.130000 | 3.616000 | -0.185000 | 0.400000 | -0.015000 | 26.000000 | ... | 0.000690 | 0.000690 | 26.000000 | 0.000690 | 0.000690 | 26.000000 | 0.000690 | 0.000690 | 2.400000e+12 | 2.840442 |
75% | 101.750000 | 0.612250 | 0.270000 | 1.925750 | 67.250000 | 3.864250 | 0.285000 | 1.220000 | 0.080000 | 27.000000 | ... | 0.000730 | 0.000730 | 26.300000 | 0.000730 | 0.000730 | 26.300000 | 0.000730 | 0.000730 | 2.400000e+12 | 2.980399 |
max | 102.800000 | 0.970000 | 0.410000 | 3.891000 | 71.980000 | 5.173000 | 2.190000 | 2.230000 | 0.170000 | 29.400000 | ... | 0.000880 | 0.000880 | 27.100000 | 0.000880 | 0.000880 | 27.100000 | 0.000880 | 0.000880 | 2.400000e+12 | 3.454556 |
8 rows × 8018 columns
train_describe = train.describe()
np.sum(train_describe.loc['count'] != 500)
2238
sns.distplot(train_describe.loc['count'])
<matplotlib.axes._subplots.AxesSubplot at 0xf464088>
[Figure output_6_1.png: distribution of per-column non-null counts]
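# The same information is available directly from the raw frame (a sketch;
# unlike describe(), isnull() also covers the 11 object-typed columns):
missing = train.isnull().sum()
print((missing > 0).sum())                # columns with at least one NaN
print((missing / len(train)).describe())  # distribution of missing ratios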
train_describe.loc['count'] == 0
210X1 False
210X2 False
210X3 False
210X4 False
210X5 False
...
750X1449 False
750X1450 False
750X1451 False
750X1452 False
Y False
Name: count, Length: 8018, dtype: bool
print('all missing', np.sum(train_describe.loc['count'] == 0),
      '\nSeriously missing', np.sum((train_describe.loc['count'] <= 250) & (train_describe.loc['count'] > 0)))
all missing 62
Seriously missing 136
print(np.sum(train_describe.loc['count'] == 222))  # 135 of the 136 seriously-missing columns share exactly 222 non-null values
135
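# A quick way to see whether the missingness is block-structured (a sketch):
# tally how many columns share each non-null count; a block of features that
# go missing together shows up as one large group (here, 135 columns at 222).
train_describe.loc['count'].value_counts().sort_index().tail(10)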
train_cols_allnull = train_describe.columns[train_describe.loc['count'] == 0]
train_cols_allnull
Index(['220X150', '220X151', '340X12', '340X14', '340X16', '340X32', '340X50',
'340X52', '340X54', '340X56', '340X58', '340X60', '340X62', '340X64',
'340X66', '340X68', '340X70', '340X72', '340X74', '340X76', '340X78',
'340X80', '340X82', '340X84', '340X86', '340X88', '340X90', '340X92',
'340X94', '340X96', '340X98', '340X100', '340X102', '340X104',
'340X106', '340X112', '340X114', '340X116', '340X118', '340X120',
'340X122', '340X131', '340X142', '340X144', '340X146', '340X148',
'340X150', '340X152', '340X154', '340X156', '340X158', '340X160',
'340X162', '340X164', '340X166', '340X168', '340X170', '340X172',
'340X174', '340X176', '340X178', '340X180'],
dtype='object')
list(set(train.columns) - set(train_cols_allnull))
['330X1069',
'330X393',
'750X1150',
'312X707',
'344X184',
'360X643',
'400X217',
'750X290',
'360X185',
'360X1257',
'220X376',
......
'750X1261',
'330X496',
'311X160',
'750X616',
'261X656',
'312X531',
'440AX33',
'750X907',
...]
train_step2 = train[list(set(train.columns) - set(train_cols_allnull))] # keep only columns that are not entirely null
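# Equivalent sketch with dropna, which also preserves the original column
# order (the set difference above does not):
train_step2 = train.dropna(axis=1, how='all')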
tmp = train_describe.columns[train_describe.loc['count'] == 222]
tmp
Index(['312X5', '312X10', '312X15', '312X20', '312X25', '312X30', '312X35',
'312X40', '312X45', '312X50',
...
'312X744', '312X750', '312X756', '312X762', '312X768', '312X774',
'312X780', '312X786', '312X792', '312X798'],
dtype='object', length=135)
null_222 = train_step2[tmp]
null_222
 | 312X5 | 312X10 | 312X15 | 312X20 | 312X25 | 312X30 | 312X35 | 312X40 | 312X45 | 312X50 | ... | 312X744 | 312X750 | 312X756 | 312X762 | 312X768 | 312X774 | 312X780 | 312X786 | 312X792 | 312X798 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
495 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
496 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 107.7 | 110.0 | 25.0 | 110.0 | 110.1 | 110.0 | 101.5 | 101.6 | 101.5 | 110.0 |
497 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 107.3 | 110.0 | 25.0 | 109.9 | 110.0 | 109.9 | 101.9 | 101.9 | 101.7 | 110.0 |
498 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 105.4 | 110.0 | 25.0 | 108.8 | 108.9 | 108.8 | 99.9 | 100.0 | 99.8 | 110.0 |
499 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 106.1 | 110.0 | 25.0 | 110.1 | 110.1 | 110.0 | 101.8 | 101.9 | 101.7 | 110.0 |
500 rows × 135 columns
np.sum(np.sum(null_222[null_222['312X5']>0].isnull()))  # rows where 312X5 is present have no nulls anywhere in the block
0
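# The zero above suggests the 135 columns are populated together; a direct
# sketch of the all-or-nothing check, counting non-nulls per row:
per_row = null_222.notnull().sum(axis=1)
print(per_row.isin([0, 135]).all())  # True if each row has the whole block or none of it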
null_222[null_222['312X5']>0]
 | 312X5 | 312X10 | 312X15 | 312X20 | 312X25 | 312X30 | 312X35 | 312X40 | 312X45 | 312X50 | ... | 312X744 | 312X750 | 312X756 | 312X762 | 312X768 | 312X774 | 312X780 | 312X786 | 312X792 | 312X798 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 105.9 | 110.0 | 25.0 | 110.1 | 110.2 | 110.0 | 100.4 | 100.4 | 100.3 | 110.0 |
8 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 107.5 | 110.0 | 25.0 | 110.1 | 110.2 | 110.0 | 100.6 | 100.6 | 100.5 | 110.0 |
9 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 105.9 | 110.0 | 25.0 | 110.1 | 110.1 | 110.0 | 101.6 | 101.7 | 101.6 | 110.0 |
10 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 106.0 | 110.0 | 25.0 | 110.1 | 110.1 | 110.0 | 101.4 | 101.7 | 101.2 | 110.0 |
24 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 104.4 | 110.0 | 25.0 | 110.0 | 110.1 | 109.9 | 98.7 | 98.9 | 98.7 | 110.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
490 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 104.3 | 110.0 | 25.0 | 106.4 | 106.5 | 106.3 | 99.9 | 99.9 | 99.8 | 110.0 |
496 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 107.7 | 110.0 | 25.0 | 110.0 | 110.1 | 110.0 | 101.5 | 101.6 | 101.5 | 110.0 |
497 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 107.3 | 110.0 | 25.0 | 109.9 | 110.0 | 109.9 | 101.9 | 101.9 | 101.7 | 110.0 |
498 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 105.4 | 110.0 | 25.0 | 108.8 | 108.9 | 108.8 | 99.9 | 100.0 | 99.8 | 110.0 |
499 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | ... | 106.1 | 110.0 | 25.0 | 110.1 | 110.1 | 110.0 | 101.8 | 101.9 | 101.7 | 110.0 |
222 rows × 135 columns
train_step2_null_222 = train_step2.loc[null_222[null_222['312X5']>0].index]
train_step2.shape
(500, 7967)
train_step2_null_222.shape
(222, 7967)
tools = []
for col in train_step2.columns:
    if 'T' in col:  # only the tool/chamber ID columns contain a 'T'
        tools.append(col)
tools
['TOOL (#2)',
'Tool (#1)',
'TOOL_ID (#3)',
'TOOL_ID (#2)',
'Tool',
'Tool (#2)',
'TOOL',
'TOOL (#1)',
'TOOL_ID',
'TOOL_ID (#1)',
'Tool (#3)']
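# The substring test 'T' in col works only because the sensor columns
# (210X1, 750X1452, ...) never contain a 'T'; a sturdier sketch:
tools = [col for col in train_step2.columns if col.upper().startswith('TOOL')]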
# compare each tool column's distinct values overall vs. within the 222 block-present rows
for tool in tools:
    print('For tool', tool, train_step2[tool].nunique(), train_step2_null_222[tool].nunique())
For tool TOOL (#2) 2 2
For tool Tool (#1) 8 6
For tool TOOL_ID (#3) 2 2
For tool TOOL_ID (#2) 3 3
For tool Tool 2 2
For tool Tool (#2) 3 3
For tool TOOL 3 3
For tool TOOL (#1) 2 1
For tool TOOL_ID 6 6
For tool TOOL_ID (#1) 2 2
For tool Tool (#3) 10 10
train_step2_null_222['Tool (#1)'].unique()
array([ 530, 1113, 1018, 1110, 1245, 2823], dtype=int64)
train_step2['Tool (#1)'].unique()
array([ 329, 215, 530, 1113, 1018, 1110, 1245, 2823], dtype=int64)
np.sum(train_step2['Tool (#1)'] == 215)
141
np.sum(train_step2['Tool (#1)'] == 329)
137
500 - 278  # 141 + 137 = 278 rows use tools 215/329; the remaining 222 match the block-present count
222
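# That arithmetic suggests the 222 block-present rows are exactly the rows
# where Tool (#1) is neither 215 nor 329; a direct sketch of the check
# (True means the block's presence is fully determined by Tool (#1)):
block_present = null_222['312X5'].notnull()
print((block_present == ~train_step2['Tool (#1)'].isin([215, 329])).all())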
test_A=pd.read_excel(r"C:\Users\Administrator\Desktop\jimsir\天池大赛\manufacture\data_first\testA.xlsx")
test_A['Tool (#1)'].unique()
array([2823, 215, 329, 1110, 530, 1113, 1245, 206, 1018], dtype=int64)
test_A[((test_A['Tool (#1)'] == 215) | (test_A['Tool (#1)'] == 329) |(test_A['Tool (#1)'] == 206) )].shape
(35, 8028)
np.sum(np.sum(test_A[(test_A['Tool (#1)'] == 215) | (test_A['Tool (#1)'] == 329) |(test_A['Tool (#1)'] == 206)][tmp].isnull()))
4725
4725/35  # each of the 35 rows is missing the entire 135-column 312X block
135.0
np.sum(np.sum(test_A[(test_A['Tool (#1)'] != 215) & (test_A['Tool (#1)'] != 329) &(test_A['Tool (#1)'] != 206)][tmp].isnull()))
0
sns.distplot(train_step2[train_step2['Tool (#1)'] == 215]['Y'])
<matplotlib.axes._subplots.AxesSubplot at 0xf464dc8>
[Figure output_34_1.png: distribution of Y for rows with Tool (#1) == 215]
sns.distplot(train_step2[train_step2['Tool (#1)'] == 329]['Y'])
<matplotlib.axes._subplots.AxesSubplot at 0xf916688>
[Figure output_35_1.png: distribution of Y for rows with Tool (#1) == 329]
sns.distplot(train_step2[((train_step2['Tool (#1)'] != 329) & (train_step2['Tool (#1)'] != 215))]['Y'])
<matplotlib.axes._subplots.AxesSubplot at 0x165d83c8>
[Figure output_36_1.png: distribution of Y for the remaining rows]
from sklearn.metrics import mean_squared_error
train['Y'].median()
2.8404417463190392
pred = train_step2['Y'].copy()
a=train_step2[train_step2['Tool (#1)'] == 329]
b = train_step2[train_step2['Tool (#1)'] == 215]
c = train_step2[((train_step2['Tool (#1)'] != 215)&(train_step2['Tool (#1)'] != 329))]
pred.loc[a.index] = a['Y'].median()
pred.loc[b.index] = b['Y'].median()
pred.loc[c.index] = c['Y'].median()
pred.value_counts()
2.779034 222
2.891122 141
2.882981 137
Name: Y, dtype: int64
mean_squared_error(train_step2['Y'], pred)
0.03848814140862121
mean_squared_error(y_pred=pred,y_true=train_step2['Y'])
0.03848814140862121
train_step2['Tool (#1)'].value_counts()
215 141
329 137
1110 59
2823 55
1245 46
1018 23
1113 22
530 17
Name: Tool (#1), dtype: int64
pred = train_step2['Y'].copy()
for val in train_step2['Tool (#1)'].unique():
    print(val, train_step2[train_step2['Tool (#1)'] == val].shape[0])
    pred.loc[train_step2[train_step2['Tool (#1)'] == val].index] = train_step2[train_step2['Tool (#1)'] == val]['Y'].mean()
329 137
215 141
530 17
1113 22
1018 23
1110 59
1245 46
2823 55
mean_squared_error(y_pred=pred, y_true=train_step2['Y'])
0.03805754994694719
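# The per-tool mean baseline above is a hand-rolled groupby; the same
# prediction in one line via transform (a sketch, identical result):
pred = train_step2.groupby('Tool (#1)')['Y'].transform('mean')
mean_squared_error(y_pred=pred, y_true=train_step2['Y'])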
np.sum(train_describe.loc['min'] == train_describe.loc['max'])
1041
train_cols_allsame = train_describe.columns[train_describe.loc['min'] == train_describe.loc['max']]
train_cols_allsame
Index(['210X17', '210X31', '210X38', '210X79', '210X103', '210X196', '210X197',
'210X198', '210X227', '220X1',
...
'750X1399', '750X1400', '750X1401', '750X1403', '750X1404', '750X1406',
'750X1407', '750X1408', '750X1442', '750X1452'],
dtype='object', length=1041)
test_A_describe = test_A.describe()
np.sum(test_A_describe.loc['min'] == test_A_describe.loc['max'])
1080
test_A_cols_allsame = test_A_describe.columns[test_A_describe.loc['min'] == test_A_describe.loc['max']]
train_minus_test = set(train_cols_allsame) - set(test_A_cols_allsame)
both_allsame = set(train_cols_allsame) - set(train_minus_test)
len(train_cols_allsame)
1041
len(test_A_cols_allsame)
1080
len(train_minus_test)
22
len(both_allsame)
1019
len(set(train_step2.columns))
7967
cols = set(train_step2.columns) - both_allsame
train_step3 = train_step2[list(cols)]
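# A pandas sketch of the min == max test above that also covers object-typed
# columns: count columns with at most one distinct non-null value.
n_unique = train_step2.nunique(dropna=True)
print((n_unique <= 1).sum())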
tools
['TOOL (#2)',
'Tool (#1)',
'TOOL_ID (#3)',
'TOOL_ID (#2)',
'Tool',
'Tool (#2)',
'TOOL',
'TOOL (#1)',
'TOOL_ID',
'TOOL_ID (#1)',
'Tool (#3)']
train_tool = train[tools].copy()  # .copy() so the encoding below does not write into a slice of train
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
a_list = ['a'] * 2 + ['b']*3 + ['c','b']
print(a_list)
labelEncoder.fit_transform(a_list)
['a', 'a', 'b', 'b', 'b', 'c', 'b']
array([0, 0, 1, 1, 1, 2, 1], dtype=int64)
from sklearn.preprocessing import LabelEncoder
for tool in tools:
    lb = LabelEncoder()
    lb.fit(train_tool[tool])
    train_tool.loc[:, tool] = lb.transform(train_tool[tool])
(Without the `.copy()` when building `train_tool`, pandas emits a SettingWithCopyWarning from this loop, because it would be writing into a slice of `train`.)
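# A copy-free alternative sketch: pd.factorize encodes each column to integer
# codes in one pass (train_tool_codes is just an illustrative name; the codes
# follow first appearance rather than sorted order, so they differ from
# LabelEncoder's, but are equivalent for tree models):
train_tool_codes = train[tools].apply(lambda s: pd.factorize(s)[0])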
train.shape
(500, 8029)
train_describe.shape
(8, 8018)
# Build a random forest model, evaluated with cross-validation
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
rf = RandomForestRegressor(n_estimators=200,min_samples_leaf=6,min_samples_split=6)
val = model_selection.cross_val_score(rf,train_tool,train['Y'].values,cv=10,scoring='neg_mean_squared_error')
# Evaluate
print(val)
print(np.mean(val))
[-0.03544356 -0.02205482 -0.03145397 -0.05643132 -0.02725092 -0.04143731
-0.02598229 -0.03891663 -0.03350271 -0.03271177]
-0.03451853149088352
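# The scorer returns negated MSE; flipping the sign and taking the square
# root gives RMSE on the scale of Y (a sketch):
rmse = np.sqrt(-val)
print(rmse.mean(), rmse.std())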
train_tool_dummy = train_tool.copy()
for tool in tools:
    train_tool_dummy = pd.get_dummies(data=train_tool_dummy, columns=[tool])
val = model_selection.cross_val_score(rf, train_tool_dummy,train['Y'].values,cv=10,scoring='neg_mean_squared_error')
np.mean(val)
-0.033536821368123956
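# The dummy-encoding loop collapses to a single call, since get_dummies
# accepts a list of columns (a sketch; equivalent frame, though the column
# order may differ):
train_tool_dummy = pd.get_dummies(train_tool, columns=tools)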
# Retrain the model on a held-out split from train_test_split
from sklearn.model_selection import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(train_tool_dummy,train['Y'].values, test_size=0.3)
rf=RandomForestRegressor(n_estimators=200,min_samples_leaf=6,min_samples_split=6)
rf.fit(train_X, train_Y)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=6, min_samples_split=6,
min_weight_fraction_leaf=0.0, n_estimators=200,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
# Evaluate
pred = rf.predict(test_X)
mean_squared_error(y_pred=pred,y_true=test_Y)
0.03561266686805807
compare = pd.DataFrame({'y_pred':pred,'y_true':test_Y})
compare['diff'] = compare['y_true'] - compare['y_pred']
compare.sort_values('diff')
 | y_pred | y_true | diff |
---|---|---|---|
53 | 2.941684 | 2.485652 | -0.456033 |
139 | 2.984927 | 2.569413 | -0.415514 |
145 | 2.728678 | 2.350377 | -0.378301 |
84 | 2.860819 | 2.491770 | -0.369049 |
121 | 2.757900 | 2.397770 | -0.360130 |
... | ... | ... | ... |
113 | 2.836899 | 3.230493 | 0.393593 |
45 | 2.828529 | 3.232169 | 0.403640 |
120 | 2.902924 | 3.321433 | 0.418509 |
20 | 2.787306 | 3.270821 | 0.483515 |
125 | 2.745893 | 3.238402 | 0.492509 |
150 rows × 3 columns
train_X, test_X, train_Y, test_Y = train_test_split(train_tool_dummy,train['Y'].values, test_size=0.3)
rf=RandomForestRegressor(n_estimators=200,min_samples_leaf=6,min_samples_split=6)
rf.fit(train_X, train_Y)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=6, min_samples_split=6,
min_weight_fraction_leaf=0.0, n_estimators=200,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
# NOTE: rf was fit on train_tool_dummy, so feature_importances_ is indexed by
# its dummy columns; pairing it with train_tool.columns mislabels these values.
for i in range(len(train_tool.columns)):
    print(train_tool.columns[i], rf.feature_importances_[i])
TOOL (#2) 0.0
Tool (#1) 0.0
TOOL_ID (#3) 0.04977919950874401
TOOL_ID (#2) 0.034050966856127914
Tool 0.0006262455932305422
Tool (#2) 0.0
TOOL 0.00987126347542283
TOOL (#1) 0.002039560967227642
TOOL_ID 0.008330845725601452
TOOL_ID (#1) 0.01961358235658726
Tool (#3) 0.014775338277854095
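# A sketch that indexes the importances by the columns rf was actually fit
# on, then aggregates the dummy columns back to their source tool column
# ('TOOL' is a prefix of 'TOOL_ID', so take the longest matching prefix):
imp = pd.Series(rf.feature_importances_, index=train_X.columns)
def source_tool(col):
    matches = [t for t in tools if col.startswith(t + '_')]
    return max(matches, key=len)
print(imp.groupby(source_tool).sum().sort_values(ascending=False))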
train_corr = train_step3.corr()
strong_linear_fea = []
for col in train_corr.columns:
    if np.abs(train_corr.loc['Y', col]) >= 0.1 and col != 'Y':
        strong_linear_fea.append(col)
strong_linear_fea
['330X1069',
'330X393',
'750X1150',
'360X185',
'261X352',
'330X300',
'520X260',
'310X176',
'750X197',
...
'360X726',
'420X90',
'750X153',
'360X1059',
'330X557',
'261X241',
'330X1072',
'210X152',
'750X1389',
'330X853',
'750X164',
'330X599',
'750X869',
...]
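# Looping over the full correlation matrix computes far more than needed;
# a sketch correlating each feature with Y directly (same selection):
corr_y = train_step3.drop(columns=['Y']).corrwith(train_step3['Y'])
strong_linear_fea = corr_y[corr_y.abs() >= 0.1].index.tolist()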
train_tool_linear = train_tool_dummy.copy()
for col in strong_linear_fea:
    train_tool_linear[col] = train_step3[col]
train_tool_linear.fillna(-100, inplace=True)  # fill remaining NaNs with a sentinel value
rf=RandomForestRegressor(n_estimators=200,min_samples_leaf=5,min_samples_split=5)
val = model_selection.cross_val_score(rf, train_tool_dummy, train['Y'].values, cv=10, scoring='neg_mean_squared_error')  # NOTE: this scores train_tool_dummy again, not the train_tool_linear just built
np.mean(val)
-0.0335044546686339
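# As the comment notes, the cell above still scores train_tool_dummy; scoring
# the frame that actually carries the correlated features is a one-line change (sketch):
val = model_selection.cross_val_score(rf, train_tool_linear, train['Y'].values, cv=10, scoring='neg_mean_squared_error')
print(np.mean(val))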
from sklearn.svm import SVR
train_tool_linear = train_tool_dummy.copy()
for col in strong_linear_fea:
    if np.max(train_step3[col]) == np.min(train_step3[col]):
        continue  # skip constant columns to avoid dividing by zero
    train_tool_linear[col] = (train_step3[col] - np.min(train_step3[col])) / (np.max(train_step3[col]) - np.min(train_step3[col]))
sv = SVR(C=2.5, gamma='auto')  # 'auto' was the pre-0.22 default; setting it explicitly silences the FutureWarning
train_tool_linear.fillna(-1, inplace=True)
val = model_selection.cross_val_score(sv,train_tool_linear,train['Y'].values,cv=10,scoring='neg_mean_squared_error')
# Evaluate
np.mean(val)
-0.030984216663715352
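# The manual min-max loop has a scikit-learn counterpart (a sketch;
# train_tool_linear2 is just an illustrative name). MinMaxScaler maps
# constant columns to 0 instead of skipping them; recent scikit-learn
# (>= 0.20) passes NaNs through the scaler, and they are filled below.
# In a real pipeline it would be fit on train and only applied to test.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train_tool_linear2 = train_tool_dummy.copy()
train_tool_linear2[strong_linear_fea] = scaler.fit_transform(train_step3[strong_linear_fea])
train_tool_linear2.fillna(-1, inplace=True)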