## 基础工具
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time
import csv
warnings.filterwarnings('ignore')
%matplotlib inline
## 模型预测的
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
## 数据降维处理的
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA
import lightgbm as lgb
import xgboost as xgb
## 参数搜索和评价的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
def read_file(path, count=None, sep=' '):
    """Load a delimited text file into a pandas DataFrame.

    Args:
        path: Path of the file to read.
        count: Maximum number of data rows to read; ``None`` reads every row.
        sep: Field delimiter. Defaults to a single space, the separator this
            function previously hard-coded.

    Returns:
        pandas.DataFrame holding the parsed rows.
    """
    return pd.read_csv(path, sep=sep, nrows=count)
# Load the training set and the test-A set (whole files, no row limit).
Train_data = read_file('used_car_train_20200313.csv')
TestA_data = read_file('used_car_testA_20200313.csv')
数据字段
- SaleID 交易ID,唯一编码
- name 汽车交易名称,已脱敏
- regDate 汽车注册日期,例如20160101,2016年01月01日
- model 车型编码,已脱敏
- brand 汽车品牌,已脱敏
- bodyType 车身类型:豪华轿车:0,微型车:1,厢型车:2,大巴车:3,敞篷车:4,双门汽车:5,商务车:6,搅拌车:7
- fuelType 燃油类型:汽油:0,柴油:1,液化石油气:2,天然气:3,混合动力:4,其他:5,电动:6
- gearbox 变速箱:手动:0,自动:1
- power 发动机功率:范围 [ 0, 600 ]
- kilometer 汽车已行驶公里,单位万km
- notRepairedDamage 汽车有尚未修复的损坏:是:0,否:1
- regionCode 地区编码,已脱敏
- seller 销售方:个体:0,非个体:1
- offerType 报价类型:提供:0,请求:1
- creatDate 汽车上线时间,即开始售卖时间
- price 二手车交易价格(预测目标)
- 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14' 匿名特征(根据汽车的评论、标签等大量信息得到的embedding向量),包含 v_0 至 v_14 共15个匿名特征
数据观察
# Data overview: show the first and the last rows of the training set together.
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
pd.concat([Train_data.head(), Train_data.tail()])
# pd.concat([TestA_data.head(), TestA_data.tail()])
SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_5 | v_6 | v_7 | v_8 | v_9 | v_10 | v_11 | v_12 | v_13 | v_14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 736 | 20040402 | 30.0 | 6 | 1.0 | 0.0 | 0.0 | 60 | 12.5 | ... | 0.235676 | 0.101988 | 0.129549 | 0.022816 | 0.097462 | -2.881803 | 2.804097 | -2.420821 | 0.795292 | 0.914762 |
1 | 1 | 2262 | 20030301 | 40.0 | 1 | 2.0 | 0.0 | 0.0 | 0 | 15.0 | ... | 0.264777 | 0.121004 | 0.135731 | 0.026597 | 0.020582 | -4.900482 | 2.096338 | -1.030483 | -1.722674 | 0.245522 |
2 | 2 | 14874 | 20040403 | 115.0 | 15 | 1.0 | 0.0 | 0.0 | 163 | 12.5 | ... | 0.251410 | 0.114912 | 0.165147 | 0.062173 | 0.027075 | -4.846749 | 1.803559 | 1.565330 | -0.832687 | -0.229963 |
3 | 3 | 71865 | 19960908 | 109.0 | 10 | 0.0 | 0.0 | 1.0 | 193 | 15.0 | ... | 0.274293 | 0.110300 | 0.121964 | 0.033395 | 0.000000 | -4.509599 | 1.285940 | -0.501868 | -2.438353 | -0.478699 |
4 | 4 | 111080 | 20120103 | 110.0 | 5 | 1.0 | 0.0 | 0.0 | 68 | 5.0 | ... | 0.228036 | 0.073205 | 0.091880 | 0.078819 | 0.121534 | -1.896240 | 0.910783 | 0.931110 | 2.834518 | 1.923482 |
149995 | 149995 | 163978 | 20000607 | 121.0 | 10 | 4.0 | 0.0 | 1.0 | 163 | 15.0 | ... | 0.280264 | 0.000310 | 0.048441 | 0.071158 | 0.019174 | 1.988114 | -2.983973 | 0.589167 | -1.304370 | -0.302592 |
149996 | 149996 | 184535 | 20091102 | 116.0 | 11 | 0.0 | 0.0 | 0.0 | 125 | 10.0 | ... | 0.253217 | 0.000777 | 0.084079 | 0.099681 | 0.079371 | 1.839166 | -2.774615 | 2.553994 | 0.924196 | -0.272160 |
149997 | 149997 | 147587 | 20101003 | 60.0 | 11 | 1.0 | 1.0 | 0.0 | 90 | 6.0 | ... | 0.233353 | 0.000705 | 0.118872 | 0.100118 | 0.097914 | 2.439812 | -1.630677 | 2.290197 | 1.891922 | 0.414931 |
149998 | 149998 | 45907 | 20060312 | 34.0 | 10 | 3.0 | 1.0 | 0.0 | 156 | 15.0 | ... | 0.256369 | 0.000252 | 0.081479 | 0.083558 | 0.081498 | 2.075380 | -2.633719 | 1.414937 | 0.431981 | -1.659014 |
149999 | 149999 | 177672 | 19990204 | 19.0 | 28 | 6.0 | 0.0 | 1.0 | 193 | 12.5 | ... | 0.284475 | 0.000000 | 0.040072 | 0.062543 | 0.025819 | 1.978453 | -3.179913 | 0.031724 | -1.483350 | -0.342674 |
10 rows × 31 columns
# Row and column counts of both data sets
print('Train data shape:',Train_data.shape)
print('Test data shape:',TestA_data.shape)
Train data shape: (150000, 31)
Test data shape: (50000, 30)
# Inspect the data
# info() lists the dtype and non-null count of every column
# Train_data.info()
TestA_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SaleID 50000 non-null int64
1 name 50000 non-null int64
2 regDate 50000 non-null int64
3 model 50000 non-null float64
4 brand 50000 non-null int64
5 bodyType 48587 non-null float64
6 fuelType 47107 non-null float64
7 gearbox 48090 non-null float64
8 power 50000 non-null int64
9 kilometer 50000 non-null float64
10 notRepairedDamage 50000 non-null object
11 regionCode 50000 non-null int64
12 seller 50000 non-null int64
13 offerType 50000 non-null int64
14 creatDate 50000 non-null int64
15 v_0 50000 non-null float64
16 v_1 50000 non-null float64
17 v_2 50000 non-null float64
18 v_3 50000 non-null float64
19 v_4 50000 non-null float64
20 v_5 50000 non-null float64
21 v_6 50000 non-null float64
22 v_7 50000 non-null float64
23 v_8 50000 non-null float64
24 v_9 50000 non-null float64
25 v_10 50000 non-null float64
26 v_11 50000 non-null float64
27 v_12 50000 non-null float64
28 v_13 50000 non-null float64
29 v_14 50000 non-null float64
dtypes: float64(20), int64(9), object(1)
memory usage: 11.4+ MB
EDA数据探索性分析
# import pandas_profiling
# Train_data_report = pandas_profiling.ProfileReport(Train_data.sample(n=1000))
Train_data.sample(n=1000)
SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_5 | v_6 | v_7 | v_8 | v_9 | v_10 | v_11 | v_12 | v_13 | v_14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
133959 | 133959 | 14152 | 20070408 | 63.0 | 0 | 2.0 | 1.0 | 0.0 | 140 | 15.0 | ... | 0.247791 | 0.000411 | 0.131053 | 0.074492 | 0.087644 | 2.397437 | -2.013465 | 1.082190 | 0.337956 | -0.498149 |
36804 | 36804 | 13004 | 19990911 | 7.0 | 5 | 0.0 | 0.0 | 0.0 | 75 | 15.0 | ... | 0.257088 | 0.082413 | 0.019284 | 0.028461 | 0.072921 | -1.993126 | 1.307589 | -2.933913 | 0.388731 | -0.581585 |
43852 | 43852 | 138236 | 20030301 | 55.0 | 17 | 0.0 | 1.0 | 0.0 | 116 | 15.0 | ... | 0.246199 | 0.000014 | 0.003112 | 0.062542 | 0.115071 | 3.031704 | -1.451162 | -1.601463 | 1.974878 | -1.162737 |
138366 | 138366 | 98885 | 19990002 | 30.0 | 6 | 1.0 | 0.0 | NaN | 0 | 15.0 | ... | 0.239293 | 0.000000 | 0.117356 | 0.029686 | 0.083381 | 3.962004 | 0.064650 | -3.085657 | 0.204415 | 0.369803 |
136751 | 136751 | 152323 | 20100611 | 65.0 | 1 | 4.0 | 1.0 | 0.0 | 140 | 12.5 | ... | 0.274975 | 0.000545 | 0.053238 | 0.106818 | 0.031934 | 1.285200 | -3.766570 | 3.279240 | -0.206268 | 0.219337 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
54189 | 54189 | 26287 | 19990312 | 8.0 | 0 | 2.0 | 0.0 | 0.0 | 150 | 15.0 | ... | 0.263254 | 0.095075 | 0.102901 | 0.041104 | 0.029092 | -3.339819 | 1.166773 | -0.642460 | -1.065173 | 0.311333 |
15092 | 15092 | 3011 | 20010407 | 65.0 | 1 | 0.0 | 0.0 | 0.0 | 102 | 15.0 | ... | 0.269231 | 0.116930 | 0.086352 | 0.037756 | 0.018960 | -4.862294 | 1.569527 | -0.709982 | -1.380195 | -0.696717 |
9229 | 9229 | 162281 | 20061208 | 0.0 | 0 | 0.0 | 1.0 | 0.0 | 105 | 15.0 | ... | 0.260780 | 0.000459 | 0.096313 | 0.083045 | 0.048458 | 2.146483 | -2.425364 | 1.503840 | -0.096753 | 1.015214 |
147647 | 147647 | 6128 | 19991105 | 44.0 | 0 | 3.0 | 1.0 | 0.0 | 102 | 15.0 | ... | 0.271919 | 0.110720 | 0.093585 | 0.027404 | 0.023803 | -4.414984 | 1.352092 | -1.310048 | -1.758741 | -0.614666 |
39775 | 39775 | 806 | 20000603 | 77.0 | 0 | 3.0 | 1.0 | 0.0 | 116 | 15.0 | ... | 0.265784 | 0.110579 | 0.084767 | 0.032677 | 0.049479 | -4.450537 | 1.296043 | -1.124556 | -0.882583 | -0.355871 |
1000 rows × 31 columns
# Train_data_report.to_file('Train_data_report.html')
# 通过 .columns 查看列名
# Train_data.columns
TestA_data.columns
Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
'seller', 'offerType', 'creatDate', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4',
'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13',
'v_14'],
dtype='object')
# .describe() gives summary statistics for the numeric columns:
# count, mean, standard deviation (std), min, the 25% / 50% (median) / 75% quantiles, and max
# Train_data.describe()
TestA_data.describe()
SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_5 | v_6 | v_7 | v_8 | v_9 | v_10 | v_11 | v_12 | v_13 | v_14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 50000.000000 | 50000.000000 | 5.000000e+04 | 50000.000000 | 50000.000000 | 48587.000000 | 47107.000000 | 48090.000000 | 50000.000000 | 50000.000000 | ... | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 |
mean | 174999.500000 | 68542.223280 | 2.003393e+07 | 46.844520 | 8.056240 | 1.782185 | 0.373405 | 0.224350 | 119.883620 | 12.595580 | ... | 0.248669 | 0.045021 | 0.122744 | 0.057997 | 0.062000 | -0.017855 | -0.013742 | -0.013554 | -0.003147 | 0.001516 |
std | 14433.901067 | 61052.808133 | 5.368870e+04 | 49.469548 | 7.819477 | 1.760736 | 0.546442 | 0.417158 | 185.097387 | 3.908979 | ... | 0.044601 | 0.051766 | 0.195972 | 0.029211 | 0.035653 | 3.747985 | 3.231258 | 2.515962 | 1.286597 | 1.027360 |
min | 150000.000000 | 0.000000 | 1.991000e+07 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.500000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -9.160049 | -5.411964 | -8.916949 | -4.123333 | -6.112667 |
25% | 162499.750000 | 11203.500000 | 1.999091e+07 | 10.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 75.000000 | 12.500000 | ... | 0.243762 | 0.000044 | 0.062644 | 0.035084 | 0.033714 | -3.700121 | -1.971325 | -1.876703 | -1.060428 | -0.437920 |
50% | 174999.500000 | 52248.500000 | 2.003091e+07 | 29.000000 | 6.000000 | 1.000000 | 0.000000 | 0.000000 | 109.000000 | 15.000000 | ... | 0.257877 | 0.000815 | 0.095828 | 0.057084 | 0.058764 | 1.613212 | -0.355843 | -0.142779 | -0.035956 | 0.138799 |
75% | 187499.250000 | 118856.500000 | 2.007110e+07 | 65.000000 | 13.000000 | 3.000000 | 1.000000 | 0.000000 | 150.000000 | 15.000000 | ... | 0.265328 | 0.102025 | 0.125438 | 0.079077 | 0.087489 | 2.832708 | 1.262914 | 1.764335 | 0.941469 | 0.681163 |
max | 199999.000000 | 196805.000000 | 2.015121e+07 | 246.000000 | 39.000000 | 7.000000 | 6.000000 | 1.000000 | 20000.000000 | 15.000000 | ... | 0.291618 | 0.153265 | 1.358813 | 0.156355 | 0.214775 | 12.338872 | 18.856218 | 12.950498 | 5.913273 | 2.624622 |
8 rows × 29 columns
# Collect the names of the numeric (non-object dtype) columns
numerical_cols = Train_data.select_dtypes(exclude = 'object').columns
numerical_cols
Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
'gearbox', 'power', 'kilometer', 'regionCode', 'seller', 'offerType',
'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6',
'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14'],
dtype='object')
categorical_cols = Train_data.select_dtypes(include = 'object').columns
categorical_cols
Index(['notRepairedDamage'], dtype='object')
Train_data.head()
SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_5 | v_6 | v_7 | v_8 | v_9 | v_10 | v_11 | v_12 | v_13 | v_14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 736 | 20040402 | 30.0 | 6 | 1.0 | 0.0 | 0.0 | 60 | 12.5 | ... | 0.235676 | 0.101988 | 0.129549 | 0.022816 | 0.097462 | -2.881803 | 2.804097 | -2.420821 | 0.795292 | 0.914762 |
1 | 1 | 2262 | 20030301 | 40.0 | 1 | 2.0 | 0.0 | 0.0 | 0 | 15.0 | ... | 0.264777 | 0.121004 | 0.135731 | 0.026597 | 0.020582 | -4.900482 | 2.096338 | -1.030483 | -1.722674 | 0.245522 |
2 | 2 | 14874 | 20040403 | 115.0 | 15 | 1.0 | 0.0 | 0.0 | 163 | 12.5 | ... | 0.251410 | 0.114912 | 0.165147 | 0.062173 | 0.027075 | -4.846749 | 1.803559 | 1.565330 | -0.832687 | -0.229963 |
3 | 3 | 71865 | 19960908 | 109.0 | 10 | 0.0 | 0.0 | 1.0 | 193 | 15.0 | ... | 0.274293 | 0.110300 | 0.121964 | 0.033395 | 0.000000 | -4.509599 | 1.285940 | -0.501868 | -2.438353 | -0.478699 |
4 | 4 | 111080 | 20120103 | 110.0 | 5 | 1.0 | 0.0 | 0.0 | 68 | 5.0 | ... | 0.228036 | 0.073205 | 0.091880 | 0.078819 | 0.121534 | -1.896240 | 0.910783 | 0.931110 | 2.834518 | 1.923482 |
5 rows × 31 columns
# Choose the feature columns: every numeric column except the sale ID,
# the (anonymised) listing name and the prediction target 'price'
feature_cols = [col for col in numerical_cols if col not in ['SaleID','name','price']]
# feature_cols = [col for col in feature_cols if 'Type' not in col]
feature_cols
['regDate',
'model',
'brand',
'bodyType',
'fuelType',
'gearbox',
'power',
'kilometer',
'regionCode',
'seller',
'offerType',
'creatDate',
'v_0',
'v_1',
'v_2',
'v_3',
'v_4',
'v_5',
'v_6',
'v_7',
'v_8',
'v_9',
'v_10',
'v_11',
'v_12',
'v_13',
'v_14']
x_Train_data = Train_data[feature_cols]
# 一个异常值处理的代码
def outliers_proc(data, col_name, scale=3, plot=True):
    """Drop the rows whose ``col_name`` value is a box-plot outlier.

    A value counts as an outlier when it lies more than ``scale`` times the
    inter-quartile range below the 25% quantile or above the 75% quantile.
    Summary statistics of the removed values are printed and, optionally,
    box plots of the column before and after cleaning are drawn.

    :param data: pandas DataFrame to clean; it is not modified.
    :param col_name: name of the column to test for outliers.
    :param scale: multiple of the inter-quartile range used as the fence width.
    :param plot: draw the before/after box plots when True (the default).
    :return: a new DataFrame without the outlier rows, re-indexed from 0.
    """

    def box_plot_outliers(data_ser, box_scale):
        """Return ((below_mask, above_mask), (lower_fence, upper_fence)).

        :param data_ser: pandas Series to test.
        :param box_scale: multiple of the inter-quartile range.
        """
        q1 = data_ser.quantile(0.25)
        q3 = data_ser.quantile(0.75)
        fence_width = box_scale * (q3 - q1)
        val_low = q1 - fence_width
        val_up = q3 + fence_width
        return (data_ser < val_low, data_ser > val_up), (val_low, val_up)

    data_series = data[col_name]
    (rule_low, rule_up), _ = box_plot_outliers(data_series, box_scale=scale)

    # Work with positional boolean masks.  The previous version passed
    # positional indices to DataFrame.drop, which is label based, so it failed
    # or dropped the wrong rows whenever the index was not 0..n-1.
    low_mask = rule_low.to_numpy()
    up_mask = rule_up.to_numpy()
    outlier_mask = low_mask | up_mask

    print("Delete number is: {}".format(int(outlier_mask.sum())))
    data_n = data.loc[~outlier_mask].reset_index(drop=True)
    # NOTE: the label says "column", but the value printed is the number of rows left.
    print("Now column number is: {}".format(data_n.shape[0]))

    print("Description of data less than the lower bound is:")
    print(data_series[low_mask].describe())
    print("Description of data larger than the upper bound is:")
    print(data_series[up_mask].describe())

    if plot:
        # Left: original column, right: column after outlier removal.
        _, ax = plt.subplots(1, 2, figsize=(10, 7))
        sns.boxplot(y=data[col_name], data=data, palette="Set1", ax=ax[0])
        sns.boxplot(y=data_n[col_name], data=data_n, palette="Set1", ax=ax[1])
    return data_n
x_Train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149795 entries, 0 to 149794
Data columns (total 28 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 regDate 149795 non-null int64
1 model 149794 non-null float64
2 brand 149795 non-null int64
3 bodyType 145301 non-null float64
4 fuelType 141120 non-null float64
5 gearbox 143831 non-null float64
6 power 149795 non-null int64
7 kilometer 149795 non-null float64
8 regionCode 149795 non-null int64
9 seller 149795 non-null int64
10 offerType 149795 non-null int64
11 creatDate 149795 non-null int64
12 price 149795 non-null int64
13 v_0 149795 non-null float64
14 v_1 149795 non-null float64
15 v_2 149795 non-null float64
16 v_3 149795 non-null float64
17 v_4 149795 non-null float64
18 v_5 149795 non-null float64
19 v_6 149795 non-null float64
20 v_7 149795 non-null float64
21 v_8 149795 non-null float64
22 v_9 149795 non-null float64
23 v_10 149795 non-null float64
24 v_11 149795 non-null float64
25 v_12 149795 non-null float64
26 v_13 149795 non-null float64
27 v_14 149795 non-null float64
dtypes: float64(20), int64(8)
memory usage: 32.0 MB
# x_Train_data = outliers_proc(x_Train_data,'model', scale=3)
# x_Train_data = outliers_proc(x_Train_data,'bodyType', scale=3)
# x_Train_data = outliers_proc(x_Train_data,'fuelType', scale=3)
Delete number is: 81
Now column number is: 149795
Description of data less than the lower bound is:
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
Name: fuelType, dtype: float64
Description of data larger than the upper bound is:
count 81.000000
mean 5.444444
std 0.500000
min 5.000000
25% 5.000000
50% 5.000000
75% 6.000000
max 6.000000
Name: fuelType, dtype: float64
![png](output_21_1.png)
# NOTE(review): feature_cols leaves out 'price' and x_Train_data is built as
# Train_data[feature_cols], so on a clean top-to-bottom run this lookup raises KeyError.
# The output shown appears to come from an earlier notebook state — confirm, or use Train_data['price'].
x_Train_data['price']
0 1850
1 3600
2 6222
3 2400
4 5200
...
149790 5900
149791 9500
149792 7500
149793 4999
149794 4700
Name: price, Length: 149795, dtype: int64
# Training samples: feature matrix and target
x_train = Train_data[feature_cols]
y_train = Train_data['price']
# Test samples: same feature columns, no target
x_test = TestA_data[feature_cols]
print('X train shape:',x_train.shape)
print('X test shape:',x_test.shape)
X train shape: (150000, 27)
X test shape: (50000, 27)
# Replace missing values with the sentinel -1 in both feature matrices
x_train = x_train.fillna(-1)
x_test = x_test.fillna(-1)
# Basic distribution statistics of the target
y_train.describe()
count 150000.000000
mean 5923.327333
std 7501.998477
min 11.000000
25% 1300.000000
50% 3250.000000
75% 7700.000000
max 99999.000000
Name: price, dtype: float64
def reduce_mem_usage(df, use_float16=True):
    """Shrink a DataFrame's memory footprint by downcasting its columns.

    Integer columns are cast to the narrowest signed integer type that holds
    their observed min/max, float columns to the narrowest allowed float type,
    and object columns to ``category``.  Columns of any other dtype (bool,
    datetime, category, pandas extension dtypes) and empty or all-NaN columns
    are left untouched.

    Args:
        df: DataFrame to optimise.  It is modified in place.
        use_float16: Allow downcasting floats to ``float16`` (the default).
            ``float16`` keeps only about three significant decimal digits, so
            pass ``False`` to stop at ``float32`` when precision matters.

    Returns:
        The same DataFrame object, for convenient re-assignment.
    """
    bytes_per_mb = 1024 ** 2
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            df[col] = df[col].astype('category')
            continue
        if not isinstance(col_type, np.dtype):
            # category / nullable extension dtypes: min() may be undefined
            # and a plain numpy cast may fail, so leave them alone.
            continue
        if np.issubdtype(col_type, np.integer):
            candidates, limits, to_py = int_types, np.iinfo, int
        elif np.issubdtype(col_type, np.floating):
            candidates, limits, to_py = float_types, np.finfo, float
        else:
            # bool, datetime, timedelta, complex ... are not downcast.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            continue  # empty or all-NaN column: nothing to measure
        # Compare as plain Python numbers so the range test is exact.
        c_min, c_max = to_py(c_min), to_py(c_max)

        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the type's limit still fits,
            # so a column is never widened beyond its current dtype.
            if to_py(bounds.min) <= c_min and c_max <= to_py(bounds.max):
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))
    return df
x_train = reduce_mem_usage(x_train)
x_test = reduce_mem_usage(x_test)
Memory usage of dataframe is 30.90 MB
Memory usage after optimization is: 7.87 MB
Decreased by 74.5%
Memory usage of dataframe is 10.30 MB
Memory usage after optimization is: 2.62 MB
Decreased by 74.5%
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows for validation.  The target halves are bound
# to new names so the full-length y_train is not overwritten and the cell can be
# re-run; random_state makes the split reproducible.
X_train, X_test, y_tr, y_test = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)
# Wrap the splits in LightGBM's Dataset format
lgb_train = lgb.Dataset(X_train, y_tr)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# Training parameters as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # boosting type
    'objective': 'regression',  # objective function
    # Evaluation metric.  'auc' was dropped: it is a classification metric, it
    # stayed at 1 on this regression target and made early stopping pick iteration 1.
    'metric': 'l2',
    'num_leaves': 127,  # number of leaves per tree
    'learning_rate': 0.01,  # learning rate
    'feature_fraction': 0.9,  # fraction of features sampled per tree
    'bagging_fraction': 0.8,  # fraction of rows sampled per tree
    'bagging_freq': 5,  # k means bagging is performed every k iterations
    'verbose': 1  # <0: fatal only, =0: errors (warnings), >0: info
}
# Train with early stopping on the validation set.  The early_stopping callback
# replaces the early_stopping_rounds= keyword, which LightGBM 4 no longer accepts.
# NOTE(review): 20 rounds at learning_rate 0.01 barely move the predictions away
# from the mean target (see the logged l2 values) — consider a larger num_boost_round.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=20,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)
# Save the model to a file
gbm.save_model('model.txt')
# Predict on the validation split
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# Evaluate the model; the value is a mean absolute error, so it is labelled as such
print('The mae of prediction is:', mean_absolute_error(y_test, y_pred))
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.090954 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4977
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 25
[LightGBM] [Info] Start training from score 5919.941275
[1] valid_0's auc: 1 valid_0's l2: 5.65302e+07
Training until validation scores don't improve for 5 rounds
[2] valid_0's auc: 1 valid_0's l2: 5.54992e+07
[3] valid_0's auc: 1 valid_0's l2: 5.44915e+07
[4] valid_0's auc: 1 valid_0's l2: 5.35056e+07
[5] valid_0's auc: 1 valid_0's l2: 5.25345e+07
[6] valid_0's auc: 1 valid_0's l2: 5.15829e+07
Early stopping, best iteration is:
[1] valid_0's auc: 1 valid_0's l2: 5.65302e+07
The rmse of prediction is: 4966.258600208158
lgb_test = gbm.predict(x_test, num_iteration=gbm.best_iteration)
lgb_test
array([6159.74936094, 5866.91656302, 5924.70077607, ..., 5916.21964354,
5961.24691629, 5900.2553832 ])
sub = pd.read_csv('used_car_sample_submit.csv')
sub
SaleID | price | |
---|---|---|
0 | 150000 | 0 |
1 | 150001 | 0 |
2 | 150002 | 0 |
3 | 150003 | 0 |
4 | 150004 | 0 |
... | ... | ... |
49995 | 199995 | 0 |
49996 | 199996 | 0 |
49997 | 199997 | 0 |
49998 | 199998 | 0 |
49999 | 199999 | 0 |
50000 rows × 2 columns
sub['price'] = lgb_test
sub
SaleID | price | |
---|---|---|
0 | 150000 | 7878.594467 |
1 | 150001 | 5685.291192 |
2 | 150002 | 5999.232414 |
3 | 150003 | 5999.232414 |
4 | 150004 | 5685.291192 |
... | ... | ... |
49995 | 199995 | 5805.003001 |
49996 | 199996 | 5685.291192 |
49997 | 199997 | 6100.446686 |
49998 | 199998 | 6100.446686 |
49999 | 199999 | 5882.450434 |
50000 rows × 2 columns
sub.to_csv('sub_sample.csv',index=False)