## Basic utilities
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time
from tqdm import tqdm
import itertools
warnings.filterwarnings('ignore')
%matplotlib inline
## Models for prediction
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
## Dimensionality reduction
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA
## Hyperparameter search and evaluation
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import scipy.signal as signal
# Handle outliers
def smooth_cols(group, out_value, kind):
    cols = ['power']
    if kind == 'g':
        for col in cols:
            # Keep values below out_value; replace the rest with the group's 99.5% quantile
            yes_no = (group[col] < out_value).astype('int')
            new = yes_no * group[col]
            group[col] = new.replace(0, group[col].quantile(q=0.995))
        return group
    if kind == 'l':
        for col in cols:
            # Keep values above out_value; replace the rest with the group's 7% quantile
            yes_no = (group[col] > out_value).astype('int')
            new = yes_no * group[col]
            group[col] = new.replace(0, group[col].quantile(q=0.07))
        return group
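# A toy check of smooth_cols (hypothetical data, not from the competition files):
# with kind='g', values of 'power' at or above out_value are replaced by the group's
# 0.995 quantile, mirroring the commented-out groupby call further down.
toy = pd.DataFrame({'bodyType': [1, 1, 1, 2, 2], 'power': [50, 60, 9999, 70, 8888]})
toy = toy.groupby('bodyType').apply(smooth_cols, out_value=600, kind='g')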
def date_proc(x):
    # Some raw dates have month '00'; map it to January so pd.to_datetime can parse them
    m = int(x[4:6])
    if m == 0:
        m = 1
    return x[:4] + '-' + str(m) + '-' + x[6:]
# Date-extraction helper
def date_tran(df, fea_col):
    for f in tqdm(fea_col):
        df[f] = pd.to_datetime(df[f].astype('str').apply(date_proc))
        df[f + '_year'] = df[f].dt.year
        df[f + '_month'] = df[f].dt.month
        df[f + '_day'] = df[f].dt.day
        df[f + '_dayofweek'] = df[f].dt.dayofweek
    return df
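# Worked example of the zero-month repair (toy input, not from the data):
print(date_proc('20160002'))  # -> '2016-1-02', which pd.to_datetime parses without error
toy = date_tran(pd.DataFrame({'regDate': [20160002, 20080307]}), ['regDate'])
print(toy[['regDate_year', 'regDate_month', 'regDate_day', 'regDate_dayofweek']])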
# Binning
def cut_group(df, cols, num_bins=50):
    # Equal-width binning: split each column's range into num_bins intervals
    for col in cols:
        all_range = df[col].max() - df[col].min()
        bins = [df[col].min() + i * all_range / num_bins for i in range(num_bins + 1)]
        df[col + '_bin'] = pd.cut(df[col], bins, labels=False)
    return df
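# Toy check of cut_group (hypothetical data): integer bin labels 0..num_bins-1,
# with NaN only for values equal to the exact left edge (pd.cut intervals are left-open).
toy = cut_group(pd.DataFrame({'power': np.linspace(1, 600, 10)}), ['power'], num_bins=50)
print(toy)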
### Count encoding
def count_coding(df, fea_col):
    for f in fea_col:
        df[f + '_count'] = df[f].map(df[f].value_counts())
    return df
# Cross-feature statistics: describe each categorical column with numerical-column stats
def cross_cat_num(df, num_col, cat_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            # Note: the dict-renamer form of agg below requires pandas < 1.0
            feat = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max', '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
            })
            df = df.merge(feat, on=f1, how='left')
    return df
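# The dict-renamer agg used above was removed in pandas 1.0 (it raises SpecificationError).
# A behaviour-equivalent sketch for newer pandas versions, under that assumption:
def cross_cat_num_v2(df, num_col, cat_col):
    for f1 in cat_col:
        for f2 in num_col:
            stats = df.groupby(f1)[f2].agg(['max', 'min', 'median'])
            stats.columns = ['{}_{}_{}'.format(f1, f2, s) for s in stats.columns]
            df = df.merge(stats.reset_index(), on=f1, how='left')
    return df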
### Second-order crosses of categorical features
from scipy.stats import entropy
def cross_qua_cat_num(df):
    for f_pair in tqdm([
        ['model', 'brand'], ['model', 'regionCode'], ['brand', 'regionCode']
    ]):
        ### Co-occurrence count
        df['_'.join(f_pair) + '_count'] = df.groupby(f_pair)['SaleID'].transform('count')
        ### nunique and entropy (dict-renamer agg, requires pandas < 1.0)
        df = df.merge(df.groupby(f_pair[0], as_index=False)[f_pair[1]].agg({
            '{}_{}_nunique'.format(f_pair[0], f_pair[1]): 'nunique',
            '{}_{}_ent'.format(f_pair[0], f_pair[1]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[0], how='left')
        df = df.merge(df.groupby(f_pair[1], as_index=False)[f_pair[0]].agg({
            '{}_{}_nunique'.format(f_pair[1], f_pair[0]): 'nunique',
            '{}_{}_ent'.format(f_pair[1], f_pair[0]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[1], how='left')
        ### Proportion preference
        df['{}_in_{}_prop'.format(f_pair[0], f_pair[1])] = df['_'.join(f_pair) + '_count'] / df[f_pair[1] + '_count']
        df['{}_in_{}_prop'.format(f_pair[1], f_pair[0])] = df['_'.join(f_pair) + '_count'] / df[f_pair[0] + '_count']
    return df
def reduce_mem_usage(df):
    """Iterate through all the columns of a dataframe and modify the data types
    to reduce memory usage.
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} bytes'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} bytes'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
## Load the data with pandas (a very friendly library for reading data)
Train_data = reduce_mem_usage(pd.read_csv('car_train_0110.csv', sep=' '))
TestA_data = reduce_mem_usage(pd.read_csv('car_testA_0110.csv', sep=' '))
#Train_data = Train_data[Train_data['price']>100]
#Train_data['price'] = np.log1p(Train_data['price'])
## Print the dataset shapes
print('Train data shape:',Train_data.shape)
print('TestA data shape:',TestA_data.shape)
# Concatenate the train and test sets
concat_data = pd.concat([Train_data,TestA_data])
concat_data['notRepairedDamage'] = concat_data['notRepairedDamage'].replace('-',0).astype('float16')
#concat_data = concat_data.fillna(concat_data.mode().iloc[0,:])
#concat_data.index = range(200000)
#concat_data = concat_data.groupby('bodyType').apply(smooth_cols,out_value=600,kind='g')
#concat_data.index = range(200000)
#concat_data['power'] = np.log(concat_data['power'])
print('concat_data shape:',concat_data.shape)
Memory usage of dataframe is 80000128.00 bytes
Memory usage after optimization is: 21750128.00 bytes
Decreased by 72.8%
Memory usage of dataframe is 15600128.00 bytes
Memory usage after optimization is: 4150128.00 bytes
Decreased by 73.4%
Train data shape: (250000, 40)
TestA data shape: (50000, 39)
concat_data shape: (300000, 40)
# Inspect the data
concat_data.info()
# Check missing values
concat_data.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 0 to 49999
Data columns (total 40 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SaleID 300000 non-null int32
1 name 300000 non-null int32
2 regDate 300000 non-null int32
3 model 300000 non-null float16
4 brand 300000 non-null int8
5 bodyType 269510 non-null float16
6 fuelType 273108 non-null float16
7 gearbox 283774 non-null float16
8 power 300000 non-null int16
9 kilometer 300000 non-null float16
10 notRepairedDamage 241836 non-null float16
11 regionCode 300000 non-null int16
12 seller 300000 non-null int8
13 offerType 300000 non-null int8
14 creatDate 300000 non-null int32
15 price 250000 non-null float64
16 v_0 300000 non-null float16
17 v_1 300000 non-null float16
18 v_2 300000 non-null float16
19 v_3 300000 non-null float16
20 v_4 300000 non-null float16
21 v_5 300000 non-null float16
22 v_6 300000 non-null float16
23 v_7 300000 non-null float16
24 v_8 300000 non-null float16
25 v_9 300000 non-null float16
26 v_10 300000 non-null float16
27 v_11 300000 non-null float16
28 v_12 300000 non-null float16
29 v_13 300000 non-null float16
30 v_14 300000 non-null float16
31 v_15 300000 non-null float16
32 v_16 300000 non-null float16
33 v_17 300000 non-null float16
34 v_18 300000 non-null float16
35 v_19 300000 non-null float16
36 v_20 300000 non-null float16
37 v_21 300000 non-null float16
38 v_22 300000 non-null float16
39 v_23 300000 non-null float16
dtypes: float16(30), float64(1), int16(2), int32(4), int8(3)
memory usage: 28.3 MB
SaleID 0
name 0
regDate 0
model 0
brand 0
bodyType 30490
fuelType 26892
gearbox 16226
power 0
kilometer 0
notRepairedDamage 58164
regionCode 0
seller 0
offerType 0
creatDate 0
price 50000
v_0 0
v_1 0
v_2 0
v_3 0
v_4 0
v_5 0
v_6 0
v_7 0
v_8 0
v_9 0
v_10 0
v_11 0
v_12 0
v_13 0
v_14 0
v_15 0
v_16 0
v_17 0
v_18 0
v_19 0
v_20 0
v_21 0
v_22 0
v_23 0
dtype: int64
from sklearn import ensemble
from sklearn.preprocessing import LabelEncoder
def set_missing(df, estimate_list, miss_col):
    """df: DataFrame to process; estimate_list: columns used to estimate the missing values;
    miss_col: name of the column with missing values. The DataFrame is modified in place."""
    col_list = estimate_list
    # Note: estimate_list is extended in place, so later calls also use previously imputed columns as predictors
    col_list.append(miss_col)
    process_df = df.loc[:, col_list]
    class_le = LabelEncoder()
    for i in col_list[:-1]:
        process_df.loc[:, i] = class_le.fit_transform(process_df.loc[:, i].values)
    # Split into rows where the feature is known and rows where it is missing
    known = process_df[process_df[miss_col].notnull()].values
    known[:, -1] = class_le.fit_transform(known[:, -1])
    unknown = process_df[process_df[miss_col].isnull()].values
    # X: predictor values
    X = known[:, :-1]
    # y: target labels
    y = known[:, -1]
    # Fit a RandomForestRegressor
    rfr = ensemble.RandomForestRegressor(random_state=1, n_estimators=200, max_depth=4, n_jobs=-1)
    rfr.fit(X, y)
    # Predict the missing values with the fitted model
    predicted = rfr.predict(unknown[:, :-1]).round(0).astype(int)
    predicted = class_le.inverse_transform(predicted)
    # print(predicted)
    # Fill the original missing entries with the predictions
    df.loc[(df[miss_col].isnull()), miss_col] = predicted
    return df
features = ['bodyType','v_0', 'v_1', 'v_2', 'v_3',
'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
'v_13', 'v_14', 'v_15', 'v_16', 'v_17', 'v_18', 'v_19', 'v_20', 'v_21',
'v_22', 'v_23','name', 'regDate', 'model', 'brand', 'fuelType',
'gearbox', 'kilometer', 'notRepairedDamage', 'regionCode',
'seller', 'offerType', 'creatDate', 'power', 'price']
estimate_list = ['v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13',
'v_14', 'v_15', 'v_16', 'v_17', 'v_18', 'v_19', 'v_20', 'v_21', 'v_22', 'v_23','name', 'regDate', 'model',
'brand', 'kilometer','regionCode', 'seller', 'offerType', 'creatDate', 'power', 'price']
miss_col ='bodyType'
set_missing(concat_data,estimate_list,miss_col)
miss_col ='fuelType'
set_missing(concat_data,estimate_list,miss_col)
miss_col ='gearbox'
set_missing(concat_data,estimate_list,miss_col)
miss_col ='notRepairedDamage'
set_missing(concat_data,estimate_list,miss_col)
(index) | SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_14 | v_15 | v_16 | v_17 | v_18 | v_19 | v_20 | v_21 | v_22 | v_23
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 134890 | 734 | 20160002 | 13.0 | 9 | 3.0 | 0.0 | 1.0 | 0 | 15.0 | ... | 0.092163 | 0.000000 | 18.765625 | -1.511719 | -1.008789 | -12.101562 | -0.947266 | 9.078125 | 0.581055 | 3.945312 |
1 | 306648 | 196973 | 20080307 | 72.0 | 9 | 7.0 | 5.0 | 1.0 | 173 | 15.0 | ... | 0.001070 | 0.122314 | -5.687500 | -0.489990 | -2.224609 | -0.226807 | -0.658203 | -3.949219 | 4.593750 | -1.145508 |
2 | 340675 | 25347 | 20020312 | 18.0 | 12 | 3.0 | 0.0 | 1.0 | 50 | 12.5 | ... | 0.064392 | 0.003345 | -3.294922 | 1.816406 | 3.554688 | -0.683594 | 0.971680 | 2.625000 | -0.852051 | -1.246094 |
3 | 57332 | 5382 | 20000611 | 38.0 | 8 | 7.0 | 0.0 | 1.0 | 54 | 15.0 | ... | 0.069214 | 0.000000 | -3.406250 | 1.498047 | 4.781250 | 0.039093 | 1.227539 | 3.041016 | -0.801758 | -1.251953 |
4 | 265235 | 173174 | 20030109 | 87.0 | 0 | 5.0 | 5.0 | 1.0 | 131 | 3.0 | ... | 0.000099 | 0.001656 | -4.476562 | 0.124146 | 1.364258 | -0.319824 | -1.131836 | -3.302734 | -1.998047 | -1.279297 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 375033 | 3803 | 20010407 | 6.0 | 29 | 5.0 | 0.0 | 0.0 | 186 | 10.0 | ... | 0.000000 | 0.000372 | -3.398438 | 0.939941 | 4.117188 | 0.146362 | -2.349609 | -2.636719 | -0.965332 | -1.097656 |
49996 | 406556 | 28500 | 20071001 | 130.0 | 10 | 2.0 | 0.0 | 0.0 | 272 | 7.0 | ... | 0.003208 | 0.116455 | -7.054688 | -1.259766 | -4.937500 | 0.881348 | -1.589844 | -3.496094 | 3.302734 | 3.947266 |
49997 | 511668 | 98383 | 19980102 | 23.0 | 10 | 4.0 | 0.0 | 1.0 | 190 | 0.5 | ... | 0.049591 | 0.067017 | -4.917969 | 0.507812 | -0.035461 | 0.256348 | 0.733887 | 0.779785 | 1.822266 | 5.011719 |
49998 | 533139 | 1489 | 20031001 | 70.0 | 1 | 7.0 | 4.0 | 1.0 | 101 | 15.0 | ... | 0.084595 | 0.000000 | -0.424561 | 3.892578 | -0.146851 | 1.831055 | 18.015625 | -2.513672 | -3.310547 | -1.589844 |
49999 | 592803 | 994 | 20070407 | 76.0 | 0 | 4.0 | 5.0 | 1.0 | 0 | 15.0 | ... | 0.055725 | 0.110901 | -1.422852 | 2.750000 | -2.160156 | 0.837891 | 17.671875 | -5.800781 | 3.062500 | -1.308594 |
300000 rows × 40 columns
# Inspect the data
concat_data.info()
# Check missing values
concat_data.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 0 to 49999
Data columns (total 40 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SaleID 300000 non-null int32
1 name 300000 non-null int32
2 regDate 300000 non-null int32
3 model 300000 non-null float16
4 brand 300000 non-null int8
5 bodyType 300000 non-null float16
6 fuelType 300000 non-null float16
7 gearbox 300000 non-null float16
8 power 300000 non-null int16
9 kilometer 300000 non-null float16
10 notRepairedDamage 300000 non-null float16
11 regionCode 300000 non-null int16
12 seller 300000 non-null int8
13 offerType 300000 non-null int8
14 creatDate 300000 non-null int32
15 price 250000 non-null float64
16 v_0 300000 non-null float16
17 v_1 300000 non-null float16
18 v_2 300000 non-null float16
19 v_3 300000 non-null float16
20 v_4 300000 non-null float16
21 v_5 300000 non-null float16
22 v_6 300000 non-null float16
23 v_7 300000 non-null float16
24 v_8 300000 non-null float16
25 v_9 300000 non-null float16
26 v_10 300000 non-null float16
27 v_11 300000 non-null float16
28 v_12 300000 non-null float16
29 v_13 300000 non-null float16
30 v_14 300000 non-null float16
31 v_15 300000 non-null float16
32 v_16 300000 non-null float16
33 v_17 300000 non-null float16
34 v_18 300000 non-null float16
35 v_19 300000 non-null float16
36 v_20 300000 non-null float16
37 v_21 300000 non-null float16
38 v_22 300000 non-null float16
39 v_23 300000 non-null float16
dtypes: float16(30), float64(1), int16(2), int32(4), int8(3)
memory usage: 28.3 MB
SaleID 0
name 0
regDate 0
model 0
brand 0
bodyType 0
fuelType 0
gearbox 0
power 0
kilometer 0
notRepairedDamage 0
regionCode 0
seller 0
offerType 0
creatDate 0
price 50000
v_0 0
v_1 0
v_2 0
v_3 0
v_4 0
v_5 0
v_6 0
v_7 0
v_8 0
v_9 0
v_10 0
v_11 0
v_12 0
v_13 0
v_14 0
v_15 0
v_16 0
v_17 0
v_18 0
v_19 0
v_20 0
v_21 0
v_22 0
v_23 0
dtype: int64
# Clip outliers
concat_data.loc[concat_data['power'] > 600, 'power'] = 600
concat_data.loc[concat_data['power'] < 1, 'power'] = 1
# 'name' has duplicate values; add a simple frequency feature
concat_data['name_count'] = concat_data.groupby(['name'])['SaleID'].transform('count')
# del concat_data['name']
# del concat_data['offerType']
# concat_data.drop(concat_data[concat_data['seller'] == 0].index, inplace=True)
# del concat_data['seller']
# Inspect the data
concat_data.info()
# Check missing values
concat_data.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 0 to 49999
Data columns (total 41 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SaleID 300000 non-null int32
1 name 300000 non-null int32
2 regDate 300000 non-null int32
3 model 300000 non-null float16
4 brand 300000 non-null int8
5 bodyType 300000 non-null float16
6 fuelType 300000 non-null float16
7 gearbox 300000 non-null float16
8 power 300000 non-null int16
9 kilometer 300000 non-null float16
10 notRepairedDamage 300000 non-null float16
11 regionCode 300000 non-null int16
12 seller 300000 non-null int8
13 offerType 300000 non-null int8
14 creatDate 300000 non-null int32
15 price 250000 non-null float64
16 v_0 300000 non-null float16
17 v_1 300000 non-null float16
18 v_2 300000 non-null float16
19 v_3 300000 non-null float16
20 v_4 300000 non-null float16
21 v_5 300000 non-null float16
22 v_6 300000 non-null float16
23 v_7 300000 non-null float16
24 v_8 300000 non-null float16
25 v_9 300000 non-null float16
26 v_10 300000 non-null float16
27 v_11 300000 non-null float16
28 v_12 300000 non-null float16
29 v_13 300000 non-null float16
30 v_14 300000 non-null float16
31 v_15 300000 non-null float16
32 v_16 300000 non-null float16
33 v_17 300000 non-null float16
34 v_18 300000 non-null float16
35 v_19 300000 non-null float16
36 v_20 300000 non-null float16
37 v_21 300000 non-null float16
38 v_22 300000 non-null float16
39 v_23 300000 non-null float16
40 name_count 300000 non-null int64
dtypes: float16(30), float64(1), int16(2), int32(4), int64(1), int8(3)
memory usage: 30.6 MB
SaleID 0
name 0
regDate 0
model 0
brand 0
bodyType 0
fuelType 0
gearbox 0
power 0
kilometer 0
notRepairedDamage 0
regionCode 0
seller 0
offerType 0
creatDate 0
price 50000
v_0 0
v_1 0
v_2 0
v_3 0
v_4 0
v_5 0
v_6 0
v_7 0
v_8 0
v_9 0
v_10 0
v_11 0
v_12 0
v_13 0
v_14 0
v_15 0
v_16 0
v_17 0
v_18 0
v_19 0
v_20 0
v_21 0
v_22 0
v_23 0
name_count 0
dtype: int64
# Pairwise interactions of the anonymous features: keep products, sums and differences
# whose absolute correlation with price exceeds 0.5
i = 0
df2 = concat_data.copy()
while i < 24:
    j = i + 1
    while j < 24:
        df2['temp1'] = concat_data['v_' + str(i)] * concat_data['v_' + str(j)]
        df2['temp2'] = concat_data['v_' + str(i)] + concat_data['v_' + str(j)]
        df2['temp3'] = concat_data['v_' + str(i)] - concat_data['v_' + str(j)]
        corr1 = abs(concat_data['price'].corr(df2['temp1']))
        corr2 = abs(concat_data['price'].corr(df2['temp2']))
        corr3 = abs(concat_data['price'].corr(df2['temp3']))
        if corr1 > 0.5:
            concat_data[str(i) + '*' + str(j)] = df2['temp1']
        if corr2 > 0.5:
            concat_data[str(i) + '+' + str(j)] = df2['temp2']
        if corr3 > 0.5:
            concat_data[str(i) + '-' + str(j)] = df2['temp3']
        j = j + 1
    i = i + 1
concat_data.replace(to_replace = '-', value = np.nan, inplace = True)
concat_data.fillna(concat_data.median(),inplace= True)
# for i in ['v_' +str(i) for i in range(23)]:
# for j in ['v_' +str(i) for i in range(23)]:
# concat_data[str(i)+'+'+str(j)] = concat_data[str(i)]+concat_data[str(j)]
# Cross the main categorical/ordinal columns with the anonymous features
for i in ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode']:
    for j in ['v_' + str(k) for k in range(23)]:
        concat_data[str(i) + '*' + str(j)] = concat_data[i] * concat_data[j]
concat_data.shape
(300000, 327)
# Extract date information
date_cols = ['regDate', 'creatDate']
concat_data = date_tran(concat_data,date_cols)
100%|██████████| 2/2 [00:00<00:00, 2.35it/s]
data = concat_data.copy()
# Count encoding
count_list = ['regDate', 'creatDate', 'model', 'brand', 'regionCode', 'bodyType', 'fuelType', 'regDate_year', 'regDate_month', 'regDate_day',
              'regDate_dayofweek', 'creatDate_month', 'creatDate_day', 'creatDate_dayofweek', 'kilometer']
data = count_coding(data, count_list)
# Feature construction
# Usage time: data['creatDate'] - data['regDate'] reflects how long the car has been in use;
# price is generally inversely related to it.
# Note that some dates in the data are malformed, so errors='coerce' is needed.
data['used_time1'] = (pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce') -
                      pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')).dt.days
data['used_time2'] = (pd.Timestamp.now() - pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')).dt.days
data['used_time3'] = (pd.Timestamp.now() - pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce')).dt.days
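# Why errors='coerce': some raw dates contain a zero month or day that '%Y%m%d' cannot parse;
# coerce maps them to NaT instead of raising, e.g.:
print(pd.to_datetime('20160002', format='%Y%m%d', errors='coerce'))  # NaT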
# Binning
cut_cols = ['power'] + ['used_time1', 'used_time2', 'used_time3']
data = cut_group(data, cut_cols, 50)
### Describe the categorical features with statistics of numerical ones;
### a few of the anonymous features most correlated with price were picked
cross_cat = ['model', 'brand', 'regDate_year']
cross_num = ['v_0', 'v_3', 'v_11', 'v_18', 'power']
data = cross_cat_num(data, cross_num, cross_cat)  # first-order crosses
100%|██████████| 3/3 [00:04<00:00, 1.56s/it]
## Select the feature columns
numerical_cols = data.columns
#print(numerical_cols)
cat_fea = ['SaleID','offerType','seller']
feature_cols = [col for col in numerical_cols if col not in cat_fea]
feature_cols = [col for col in feature_cols if col not in ['price']]
## Build training and test samples from the feature columns and the label column
X_data = data.iloc[:len(Train_data),:][feature_cols]
Y_data = Train_data['price']
X_test = data.iloc[len(Train_data):,:][feature_cols]
concat_data.shape
(300000, 335)
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold,KFold
from itertools import product
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=10, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
        :param n_splits: the number of splits used in mean encoding
        :param target_type: str, 'regression' or 'classification'
        :param prior_weight_func:
            a function that takes in the number of observations, and outputs prior weight
            when a dict is passed, the default exponential decay function will be used:
            k: the number of observations needed for the posterior to be weighted equally as the prior
            f: larger f --> smaller slope
        """
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg([('mean', 'mean'), ('beta', 'size')])
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
        return nf_train, nf_test, prior, col_avg_y
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name]
                X_new[nf_name] /= self.n_splits
        return X_new
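# Intuition for the smoothing in mean_encode_subroutine: each category's encoding is
# beta * prior + (1 - beta) * category_mean, where beta = prior_weight_func(category size).
# With the default function, rare categories fall back towards the global prior:
for n in [1, 2, 10, 100]:
    beta = 1 / (1 + np.exp((n - 2) / 1))
    print(n, round(beta, 4))  # 1 -> 0.7311, 2 -> 0.5, 10 -> 0.0003, 100 -> 0.0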
class_list = ['model', 'brand', 'name', 'regionCode'] + date_cols
MeanEnocodeFeature = class_list  # features to mean-encode
ME = MeanEncoder(MeanEnocodeFeature, target_type='regression')  # instantiate the mean encoder
X_data = ME.fit_transform(X_data, Y_data)  # fit on the training X and y
# x_train_fav = ME.fit_transform(x_train, y_train_fav)  # fit on the training X and y
X_test = ME.transform(X_test)  # encode the test set
X_data['price'] = Train_data['price']
from sklearn.model_selection import KFold
### Target encoding: regression offers more choices than classification; besides the mean,
### one can also encode the standard deviation, the median, and so on
enc_cols = []
stats_default_dict = {
    'max': X_data['price'].max(),
    'min': X_data['price'].min(),
    'median': X_data['price'].median(),
    'mean': X_data['price'].mean(),
    'sum': X_data['price'].sum(),
    'std': X_data['price'].std(),
    'skew': X_data['price'].skew(),
    'kurt': X_data['price'].kurt(),
    'mad': X_data['price'].mad()
}
### For now use these three encodings
enc_stats = ['max', 'min', 'mean']
skf = KFold(n_splits=10, shuffle=True, random_state=42)
for f in tqdm(['regionCode', 'brand', 'regDate_year', 'creatDate_year', 'kilometer', 'model']):
    enc_dict = {}
    for stat in enc_stats:
        enc_dict['{}_target_{}'.format(f, stat)] = stat
        X_data['{}_target_{}'.format(f, stat)] = 0
        X_test['{}_target_{}'.format(f, stat)] = 0
        enc_cols.append('{}_target_{}'.format(f, stat))
    for i, (trn_idx, val_idx) in enumerate(skf.split(X_data, Y_data)):
        trn_x, val_x = X_data.iloc[trn_idx].reset_index(drop=True), X_data.iloc[val_idx].reset_index(drop=True)
        enc_df = trn_x.groupby(f, as_index=False)['price'].agg(enc_dict)
        val_x = val_x[[f]].merge(enc_df, on=f, how='left')
        test_x = X_test[[f]].merge(enc_df, on=f, how='left')
        for stat in enc_stats:
            val_x['{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].fillna(stats_default_dict[stat])
            test_x['{}_target_{}'.format(f, stat)] = test_x['{}_target_{}'.format(f, stat)].fillna(stats_default_dict[stat])
            X_data.loc[val_idx, '{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].values
            X_test['{}_target_{}'.format(f, stat)] += test_x['{}_target_{}'.format(f, stat)].values / skf.n_splits
100%|██████████| 6/6 [00:14<00:00, 2.40s/it]
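# As noted above, regression target encoding is not limited to max/min/mean; a possible
# extension (sketch, not run here) is to rerun the same loop with more statistics, whose
# global fallbacks already exist in stats_default_dict:
enc_stats = ['max', 'min', 'mean', 'median', 'std']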
# Inspect the data
concat_data.info()
# Check missing values
concat_data.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 0 to 49999
Columns: 335 entries, SaleID to creatDate_dayofweek
dtypes: datetime64[ns](2), float16(270), float32(46), float64(1), int16(2), int32(2), int64(9), int8(3)
memory usage: 241.2 MB
SaleID 0
name 0
regDate 0
model 0
brand 0
..
regDate_dayofweek 0
creatDate_year 0
creatDate_month 0
creatDate_day 0
creatDate_dayofweek 0
Length: 335, dtype: int64
X_data.shape
(250000, 423)
X_test.shape
(50000, 422)
features1 = ['price','power', 'v_0', 'v_3', 'v_18', '1-16', '3-7', '3*8', '3*10', '3*11', '3*12', '3*15', '3+23', '3-23', '7-18', '8*18', '9-18', '11+14', '11+15', '11*18', '15*18', 'power*v_3', 'power*v_8', 'power*v_10', 'power*v_11', 'power*v_12', 'notRepairedDamage*v_3', 'notRepairedDamage*v_18', 'brand_v_11_min', 'brand_power_min', 'regDate_year_v_3_median', 'regDate_year_v_11_median', 'model_pred', 'brand_target_min', 'kilometer_target_min']
features2 = ['power', 'v_0', 'v_3', 'v_18', '1-16', '3-7', '3*8', '3*10', '3*11', '3*12', '3*15', '3+23', '3-23', '7-18', '8*18', '9-18', '11+14', '11+15', '11*18', '15*18', 'power*v_3', 'power*v_8', 'power*v_10', 'power*v_11', 'power*v_12', 'notRepairedDamage*v_3', 'notRepairedDamage*v_18', 'brand_v_11_min', 'brand_power_min', 'regDate_year_v_3_median', 'regDate_year_v_11_median', 'model_pred', 'brand_target_min', 'kilometer_target_min']
from sklearn.preprocessing import MinMaxScaler
# Feature scaling
scaler = MinMaxScaler()
scaler.fit(X_data[features1].values)
X_data = scaler.transform(X_data[features1].values)
scaler = MinMaxScaler()
scaler.fit(X_test[features2].values)
X_test = scaler.transform(X_test[features2].values)
output_path = 'user_data/'
nn_data = pd.DataFrame(X_data, columns=features1)
nn_data['price'] = np.array(Train_data['price'])
nn_data['SaleID'] = np.array(Train_data['SaleID'])
print(nn_data.shape)
nn_data.to_csv(output_path + 'train_nn10.csv', index=0, sep=' ')
(250000, 36)
output_path = 'user_data/'
nn_data = pd.DataFrame(X_test, columns=features2)
nn_data['SaleID'] = np.array(TestA_data['SaleID'])
print(nn_data.shape)
nn_data.to_csv(output_path + 'test_nn10.csv', index=0, sep=' ')
(50000, 35)
df = X_data.copy()
corr = df.corr(method='spearman')
feature_group = list(itertools.combinations(corr.columns, 2))
# print(feature_group)
# Drop highly correlated variables; once tuned, move the removal into the main workflow
def filter_corr(corr, cutoff=0.7):
    cols = []
    for i, j in feature_group:
        if corr.loc[i, j] > cutoff:
            # print(i, j, corr.loc[i, j])
            i_avg = corr[i][corr[i] != 1].mean()
            j_avg = corr[j][corr[j] != 1].mean()
            # From each highly correlated pair, drop the column with the higher average
            # correlation, unless it is protected by the `features` list defined earlier
            if i_avg >= j_avg:
                if i not in features:
                    cols.append(i)
                else:
                    cols.append(j)
            else:
                if j not in features:
                    cols.append(j)
                else:
                    cols.append(i)
    return set(cols)
drop_cols = filter_corr(corr, cutoff=0.95)
X_data = X_data.drop(drop_cols,axis=1)
x_test = X_test.drop(drop_cols,axis=1)
features1 = X_data.columns
fea1 = []
for f in features1:
    fea1.append(f)
len(fea1)
284
print(fea1)
['name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode', 'creatDate', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'v_15', 'v_16', 'v_17', 'v_18', 'v_19', 'v_21', 'v_23', 'name_count', '1-16', '3-7', '3*8', '3*10', '3*11', '3*12', '3*15', '3+23', '3-23', '7-18', '8*18', '9-18', '11+14', '11+15', '11*18', '15*18', 'model*v_1', 'model*v_2', 'model*v_3', 'model*v_4', 'model*v_6', 'model*v_8', 'model*v_9', 'model*v_10', 'model*v_11', 'model*v_12', 'model*v_13', 'model*v_14', 'model*v_15', 'model*v_17', 'model*v_18', 'model*v_19', 'model*v_20', 'model*v_22', 'brand*v_1', 'brand*v_2', 'brand*v_3', 'brand*v_4', 'brand*v_6', 'brand*v_8', 'brand*v_9', 'brand*v_10', 'brand*v_11', 'brand*v_12', 'brand*v_13', 'brand*v_14', 'brand*v_15', 'brand*v_17', 'brand*v_18', 'brand*v_19', 'brand*v_20', 'brand*v_22', 'bodyType*v_1', 'bodyType*v_4', 'bodyType*v_6', 'bodyType*v_8', 'bodyType*v_9', 'bodyType*v_10', 'bodyType*v_11', 'bodyType*v_12', 'bodyType*v_13', 'bodyType*v_14', 'bodyType*v_16', 'bodyType*v_20', 'bodyType*v_22', 'fuelType*v_1', 'fuelType*v_2', 'fuelType*v_3', 'fuelType*v_6', 'fuelType*v_8', 'fuelType*v_9', 'fuelType*v_10', 'fuelType*v_12', 'fuelType*v_13', 'fuelType*v_14', 'fuelType*v_15', 'fuelType*v_17', 'fuelType*v_18', 'fuelType*v_19', 'fuelType*v_20', 'fuelType*v_22', 'gearbox*v_0', 'gearbox*v_1', 'gearbox*v_2', 'gearbox*v_3', 'gearbox*v_4', 'gearbox*v_6', 'gearbox*v_8', 'gearbox*v_9', 'gearbox*v_10', 'gearbox*v_11', 'gearbox*v_12', 'gearbox*v_13', 'gearbox*v_14', 'gearbox*v_15', 'gearbox*v_16', 'gearbox*v_17', 'gearbox*v_18', 'gearbox*v_19', 'gearbox*v_20', 'gearbox*v_22', 'power*v_1', 'power*v_2', 'power*v_3', 'power*v_4', 'power*v_6', 'power*v_8', 'power*v_9', 'power*v_10', 'power*v_11', 'power*v_12', 'power*v_14', 'power*v_17', 'power*v_19', 'power*v_20', 'power*v_22', 'kilometer*v_0', 'kilometer*v_1', 'kilometer*v_6', 'kilometer*v_8', 'kilometer*v_9', 'kilometer*v_10', 'kilometer*v_11', 'kilometer*v_12', 'kilometer*v_13', 'kilometer*v_16', 'kilometer*v_20', 'kilometer*v_22', 'notRepairedDamage*v_0', 'notRepairedDamage*v_2', 'notRepairedDamage*v_3', 'notRepairedDamage*v_4', 'notRepairedDamage*v_6', 'notRepairedDamage*v_9', 'notRepairedDamage*v_10', 'notRepairedDamage*v_11', 'notRepairedDamage*v_12', 'notRepairedDamage*v_13', 'notRepairedDamage*v_14', 'notRepairedDamage*v_15', 'notRepairedDamage*v_16', 'notRepairedDamage*v_17', 'notRepairedDamage*v_18', 'notRepairedDamage*v_19', 'notRepairedDamage*v_20', 'notRepairedDamage*v_22', 'regionCode*v_1', 'regionCode*v_2', 'regionCode*v_3', 'regionCode*v_4', 'regionCode*v_6', 'regionCode*v_8', 'regionCode*v_9', 'regionCode*v_10', 'regionCode*v_11', 'regionCode*v_12', 'regionCode*v_13', 'regionCode*v_14', 'regionCode*v_17', 'regionCode*v_18', 'regionCode*v_19', 'regionCode*v_20', 'regionCode*v_22', 'regDate_year', 'regDate_month', 'regDate_day', 'regDate_dayofweek', 'creatDate_month', 'creatDate_day', 'creatDate_dayofweek', 'regDate_count', 'creatDate_count', 'model_count', 'brand_count', 'regionCode_count', 'bodyType_count', 'fuelType_count', 'regDate_year_count', 'regDate_month_count', 'regDate_day_count', 'regDate_dayofweek_count', 'creatDate_month_count', 'creatDate_day_count', 'creatDate_dayofweek_count', 'used_time3', 'used_time1_bin', 'used_time3_bin', 'model_v_0_max', 'model_v_0_min', 'model_v_0_median', 'model_v_3_max', 'model_v_3_min', 'model_v_3_median', 'model_v_11_max', 'model_v_11_min', 
'model_v_11_median', 'model_v_18_max', 'model_v_18_min', 'model_power_max', 'model_power_min', 'model_power_median', 'brand_v_0_max', 'brand_v_0_min', 'brand_v_0_median', 'brand_v_3_max', 'brand_v_3_min', 'brand_v_11_max', 'brand_v_11_min', 'brand_v_11_median', 'brand_v_18_max', 'brand_v_18_min', 'brand_v_18_median', 'brand_power_max', 'brand_power_min', 'brand_power_median', 'regDate_year_v_0_max', 'regDate_year_v_0_min', 'regDate_year_v_0_median', 'regDate_year_v_3_max', 'regDate_year_v_3_min', 'regDate_year_v_3_median', 'regDate_year_v_11_max', 'regDate_year_v_11_min', 'regDate_year_v_11_median', 'regDate_year_v_18_max', 'regDate_year_v_18_min', 'regDate_year_power_max', 'regDate_year_power_min', 'regDate_year_power_median', 'model_pred', 'name_pred', 'regionCode_pred', 'creatDate_pred', 'price', 'regionCode_target_max', 'regionCode_target_min', 'regionCode_target_mean', 'brand_target_max', 'brand_target_min', 'brand_target_mean', 'regDate_year_target_max', 'regDate_year_target_min', 'creatDate_year_target_max', 'creatDate_year_target_min', 'creatDate_year_target_mean', 'kilometer_target_max', 'kilometer_target_min', 'kilometer_target_mean', 'model_target_max', 'model_target_min']
corrs = X_data.corr()
df3 = corrs['price']
df3
name -0.008057
model 0.146305
brand -0.010830
bodyType 0.027771
fuelType 0.197623
...
kilometer_target_max -0.128785
kilometer_target_min NaN
kilometer_target_mean 0.448072
model_target_max 0.196083
model_target_min 0.096333
Name: price, Length: 282, dtype: float64
print(df3.iloc[2])
-0.010829998028367609
features2 = df3.index
fea2 = []
for f in features2:
    fea2.append(f)
# Collect the features whose absolute correlation with price is below 0.5
drops = []
n = len(fea2)
i = 0
while i < n:
    if abs(df3.iloc[i]) < 0.5:
        temp = fea2[i]
        drops.append(temp)
    i = i + 1
print(drops)
['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'kilometer', 'notRepairedDamage', 'regionCode', 'v_1', 'v_2', 'v_4', 'v_5', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'v_15', 'v_16', 'v_17', 'v_19', 'v_21', 'v_23', 'name_count', 'model*v_1', 'model*v_2', 'model*v_3', 'model*v_4', 'model*v_6', 'model*v_8', 'model*v_9', 'model*v_10', 'model*v_11', 'model*v_12', 'model*v_13', 'model*v_14', 'model*v_15', 'model*v_17', 'model*v_18', 'model*v_19', 'model*v_20', 'model*v_22', 'brand*v_1', 'brand*v_2', 'brand*v_3', 'brand*v_4', 'brand*v_6', 'brand*v_8', 'brand*v_9', 'brand*v_10', 'brand*v_11', 'brand*v_12', 'brand*v_13', 'brand*v_14', 'brand*v_15', 'brand*v_17', 'brand*v_18', 'brand*v_19', 'brand*v_20', 'brand*v_22', 'bodyType*v_1', 'bodyType*v_4', 'bodyType*v_6', 'bodyType*v_8', 'bodyType*v_9', 'bodyType*v_10', 'bodyType*v_11', 'bodyType*v_12', 'bodyType*v_13', 'bodyType*v_14', 'bodyType*v_16', 'bodyType*v_20', 'bodyType*v_22', 'fuelType*v_1', 'fuelType*v_2', 'fuelType*v_3', 'fuelType*v_6', 'fuelType*v_8', 'fuelType*v_9', 'fuelType*v_10', 'fuelType*v_12', 'fuelType*v_13', 'fuelType*v_14', 'fuelType*v_15', 'fuelType*v_17', 'fuelType*v_18', 'fuelType*v_19', 'fuelType*v_20', 'fuelType*v_22', 'gearbox*v_0', 'gearbox*v_1', 'gearbox*v_2', 'gearbox*v_3', 'gearbox*v_4', 'gearbox*v_6', 'gearbox*v_8', 'gearbox*v_9', 'gearbox*v_10', 'gearbox*v_11', 'gearbox*v_12', 'gearbox*v_13', 'gearbox*v_14', 'gearbox*v_15', 'gearbox*v_16', 'gearbox*v_17', 'gearbox*v_18', 'gearbox*v_19', 'gearbox*v_20', 'gearbox*v_22', 'power*v_1', 'power*v_2', 'power*v_4', 'power*v_6', 'power*v_9', 'power*v_14', 'power*v_17', 'power*v_19', 'power*v_20', 'power*v_22', 'kilometer*v_0', 'kilometer*v_1', 'kilometer*v_6', 'kilometer*v_8', 'kilometer*v_9', 'kilometer*v_10', 'kilometer*v_11', 'kilometer*v_12', 'kilometer*v_13', 'kilometer*v_16', 'kilometer*v_20', 'kilometer*v_22', 'notRepairedDamage*v_0', 'notRepairedDamage*v_2', 'notRepairedDamage*v_4', 'notRepairedDamage*v_6', 'notRepairedDamage*v_9', 'notRepairedDamage*v_10', 'notRepairedDamage*v_11', 'notRepairedDamage*v_12', 'notRepairedDamage*v_13', 'notRepairedDamage*v_14', 'notRepairedDamage*v_15', 'notRepairedDamage*v_16', 'notRepairedDamage*v_17', 'notRepairedDamage*v_19', 'notRepairedDamage*v_20', 'notRepairedDamage*v_22', 'regionCode*v_1', 'regionCode*v_2', 'regionCode*v_3', 'regionCode*v_4', 'regionCode*v_6', 'regionCode*v_8', 'regionCode*v_9', 'regionCode*v_10', 'regionCode*v_11', 'regionCode*v_12', 'regionCode*v_13', 'regionCode*v_14', 'regionCode*v_17', 'regionCode*v_18', 'regionCode*v_19', 'regionCode*v_20', 'regionCode*v_22', 'regDate_year', 'regDate_month', 'regDate_day', 'regDate_dayofweek', 'creatDate_month', 'creatDate_day', 'creatDate_dayofweek', 'regDate_count', 'creatDate_count', 'model_count', 'brand_count', 'regionCode_count', 'bodyType_count', 'fuelType_count', 'regDate_year_count', 'regDate_month_count', 'regDate_day_count', 'regDate_dayofweek_count', 'creatDate_month_count', 'creatDate_day_count', 'creatDate_dayofweek_count', 'used_time3', 'used_time1_bin', 'used_time3_bin', 'model_v_0_max', 'model_v_0_min', 'model_v_0_median', 'model_v_3_max', 'model_v_3_min', 'model_v_3_median', 'model_v_11_max', 'model_v_11_min', 'model_v_11_median', 'model_v_18_max', 'model_v_18_min', 'model_power_max', 'model_power_min', 'model_power_median', 'brand_v_0_max', 'brand_v_0_min', 'brand_v_0_median', 'brand_v_3_max', 'brand_v_3_min', 'brand_v_11_max', 'brand_v_11_median', 'brand_v_18_max', 'brand_v_18_min', 'brand_v_18_median', 'brand_power_max', 
'brand_power_median', 'regDate_year_v_0_max', 'regDate_year_v_0_min', 'regDate_year_v_0_median', 'regDate_year_v_3_max', 'regDate_year_v_3_min', 'regDate_year_v_11_max', 'regDate_year_v_11_min', 'regDate_year_v_18_max', 'regDate_year_v_18_min', 'regDate_year_power_max', 'regDate_year_power_min', 'regDate_year_power_median', 'name_pred', 'regionCode_pred', 'creatDate_pred', 'regionCode_target_max', 'regionCode_target_min', 'regionCode_target_mean', 'brand_target_max', 'brand_target_mean', 'regDate_year_target_max', 'regDate_year_target_min', 'creatDate_year_target_max', 'creatDate_year_target_min', 'creatDate_year_target_mean', 'kilometer_target_max', 'kilometer_target_mean', 'model_target_max', 'model_target_min']
X_data = X_data.drop(drops,axis=1)
features3 = X_data.columns
fea3 = []
for f in features3:
    fea3.append(f)
print(fea3)
['regDate', 'power', 'creatDate', 'v_0', 'v_3', 'v_18', '1-16', '3-7', '3*8', '3*10', '3*11', '3*12', '3*15', '3+23', '3-23', '7-18', '8*18', '9-18', '11+14', '11+15', '11*18', '15*18', 'power*v_3', 'power*v_8', 'power*v_10', 'power*v_11', 'power*v_12', 'notRepairedDamage*v_3', 'notRepairedDamage*v_18', 'brand_v_11_min', 'brand_power_min', 'regDate_year_v_3_median', 'regDate_year_v_11_median', 'model_pred', 'price', 'brand_target_min', 'kilometer_target_min']
x_test = x_test.drop(drops,axis=1)
X_data = X_data.drop('creatDate', axis = 1)
X_data = X_data.drop('regDate', axis = 1)
x_test = x_test.drop('creatDate', axis = 1)
x_test = x_test.drop('regDate', axis = 1)
features1 = X_data.columns
fea1 = []
for f in features1:
    fea1.append(f)
features2 = x_test.columns
fea2 = []
for f in features2:
    fea2.append(f)
print(fea2)
['power', 'v_0', 'v_3', 'v_18', '1-16', '3-7', '3*8', '3*10', '3*11', '3*12', '3*15', '3+23', '3-23', '7-18', '8*18', '9-18', '11+14', '11+15', '11*18', '15*18', 'power*v_3', 'power*v_8', 'power*v_10', 'power*v_11', 'power*v_12', 'notRepairedDamage*v_3', 'notRepairedDamage*v_18', 'brand_v_11_min', 'brand_power_min', 'regDate_year_v_3_median', 'regDate_year_v_11_median', 'model_pred', 'brand_target_min', 'kilometer_target_min']
len(fea2)
34
from sklearn.preprocessing import MinMaxScaler
# Feature scaling
scaler = MinMaxScaler()
scaler.fit(X_data[fea1].values)
X_data = scaler.transform(X_data[fea1].values)
scaler = MinMaxScaler()
scaler.fit(x_test[fea2].values)
x_test = scaler.transform(x_test[fea2].values)
output_path = 'user_data/'
nn_data = pd.DataFrame(X_data, columns=fea1)
nn_data['price'] = np.array(Train_data['price'])
nn_data['SaleID'] = np.array(Train_data['SaleID'])
print(nn_data.shape)
nn_data.to_csv(output_path + 'train_nn10.csv', index=0, sep=' ')
(250000, 36)
output_path = 'user_data/'
nn_data = pd.DataFrame(x_test, columns=fea2)
nn_data['SaleID'] = np.array(TestA_data['SaleID'])
print(nn_data.shape)
nn_data.to_csv(output_path + 'test_nn10.csv', index=0, sep=' ')
(50000, 35)
# tree_data_path = 'user_data/'
# Train_NN_data = pd.read_csv(tree_data_path + 'train_nn10.csv', sep=' ')
# Test_NN_data = pd.read_csv(tree_data_path + 'test_nn10.csv', sep=' ')
# numerical_cols = Train_NN_data.columns
# feature_cols = [col for col in numerical_cols if col not in ['price','SaleID']]
# ## Build training and test samples from the feature columns and the label column
# X_data = Train_NN_data[feature_cols]
# X_test = Test_NN_data[feature_cols]
# # x = np.array(X_data)
# # y = np.array(Train_NN_data['price'])
# # x_ = np.array(X_test)
# x = X_data
# y = Train_NN_data['price']
# x_ = X_test
# # # Split the dataset
# x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2)
# output_path = 'user_data/'
# x_train.to_csv(output_path + 'x_train.csv', index=0, sep=' ')
# x_test.to_csv(output_path + 'x_test.csv', index=0, sep=' ')
# y_train.to_csv(output_path + 'y_train.csv', index=0, sep=' ')
# y_test.to_csv(output_path + 'y_test.csv', index=0, sep=' ')
# x_.to_csv(output_path + 'x_.csv', index=0, sep=' ')