Hebei College Invitational: Used Car Transaction Price Prediction, Task 3

## Basic tools
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time
from tqdm import tqdm
import itertools

warnings.filterwarnings('ignore')
%matplotlib inline

## For model prediction
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

## For dimensionality reduction
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA

## For parameter search and evaluation
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

import scipy.signal as signal
# Handle outliers: per-group capping of the 'power' column
def smooth_cols(group,out_value,kind):
    cols = ['power']
    if kind == 'g':  # cap values at or above out_value at the group's 99.5th percentile
        for col in cols:
            yes_no = (group[col]<out_value).astype('int')
            new = yes_no * group[col]
            # note: genuine zeros in the column would also be replaced here
            group[col] = new.replace(0,group[col].quantile(q=0.995))
        return group
    if kind == 'l':  # cap values at or below out_value at the group's 7th percentile
        for col in cols:
            yes_no = (group[col]>out_value).astype('int')
            new = yes_no * group[col]
            group[col] = new.replace(0,group[col].quantile(q=0.07))
        return group
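A minimal usage sketch on toy data (hypothetical values; the real, commented-out call further below groups concat_data by bodyType):
toy = pd.DataFrame({'bodyType': [0, 0, 0, 1], 'power': [100, 120, 900, 5000]})
capped = toy.groupby('bodyType').apply(smooth_cols, out_value=600, kind='g')
# in group 0, power=900 (>= 600) is zeroed out and then replaced by that
# group's 99.5th percentile; group 1's single row is left effectively unchanged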
def date_proc(x):
    """Convert a raw 'YYYYMMDD' string to 'YYYY-M-DD'; a month of '00' is coerced to January."""
    m = int(x[4:6])
    if m == 0:
        m = 1
    return x[:4] + '-' + str(m) + '-' + x[6:]
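A quick sanity check (hypothetical inputs): raw dates arrive as 'YYYYMMDD' strings, and an invalid month of '00' is coerced to January.
print(date_proc('20160313'))  # -> '2016-3-13'
print(date_proc('20160013'))  # month '00' becomes 1 -> '2016-1-13'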

# Date-feature extraction
def date_tran(df,fea_col):
    for f in tqdm(fea_col):
        df[f] = pd.to_datetime(df[f].astype('str').apply(date_proc))
        df[f + '_year'] = df[f].dt.year
        df[f + '_month'] = df[f].dt.month
        df[f + '_day'] = df[f].dt.day
        df[f + '_dayofweek'] = df[f].dt.dayofweek
    return df

# Binning
def cut_group(df,cols,num_bins=50):
    for col in cols:
        all_range = df[col].max() - df[col].min()
        # num_bins+1 equal-width edges spanning [min, max]
        bins = [df[col].min() + i*all_range/num_bins for i in range(num_bins+1)]
        df[col+'_bin'] = pd.cut(df[col], bins, labels=False, include_lowest=True)
    return df
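A small illustration of the equal-width binning on hypothetical values:
toy = pd.DataFrame({'power': [1, 150, 300, 450, 600]})
toy = cut_group(toy, ['power'], num_bins=4)
print(toy['power_bin'].tolist())  # [0, 0, 1, 2, 3]: four equal-width bins over [1, 600]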

### Count encoding
def count_coding(df,fea_col):
    for f in fea_col:
        df[f + '_count'] = df[f].map(df[f].value_counts())
    return df
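Count encoding replaces each category value by its frequency; a toy example with hypothetical values:
toy = pd.DataFrame({'brand': ['a', 'b', 'a', 'a']})
toy = count_coding(toy, ['brand'])
print(toy['brand_count'].tolist())  # [3, 1, 3, 3]: every 'a' maps to its count of 3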
# Cross-feature statistics: describe each categorical with stats of the numeric columns
def cross_cat_num(df,num_col,cat_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            feat = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max', '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
            })
            df = df.merge(feat, on=f1, how='left')
    return df
### Second-order crosses of categorical features
from scipy.stats import entropy
def cross_qua_cat_num(df):
    for f_pair in tqdm([
        ['model', 'brand'], ['model', 'regionCode'], ['brand', 'regionCode']
    ]):
        ### co-occurrence count
        df['_'.join(f_pair) + '_count'] = df.groupby(f_pair)['SaleID'].transform('count')
        ### nunique and entropy
        df = df.merge(df.groupby(f_pair[0], as_index=False)[f_pair[1]].agg({
            '{}_{}_nunique'.format(f_pair[0], f_pair[1]): 'nunique',
            '{}_{}_ent'.format(f_pair[0], f_pair[1]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[0], how='left')
        df = df.merge(df.groupby(f_pair[1], as_index=False)[f_pair[0]].agg({
            '{}_{}_nunique'.format(f_pair[1], f_pair[0]): 'nunique',
            '{}_{}_ent'.format(f_pair[1], f_pair[0]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[1], how='left')
        ### proportion preferences
        df['{}_in_{}_prop'.format(f_pair[0], f_pair[1])] = df['_'.join(f_pair) + '_count'] / df[f_pair[1] + '_count']
        df['{}_in_{}_prop'.format(f_pair[1], f_pair[0])] = df['_'.join(f_pair) + '_count'] / df[f_pair[0] + '_count']
    return df
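The entropy feature above measures how evenly, say, a brand's listings spread over models; a worked example with hypothetical co-occurrence counts:
s = pd.Series([1, 1, 2, 3])  # models seen with one brand -> distribution [0.5, 0.25, 0.25]
print(entropy(s.value_counts() / s.shape[0]))  # -(0.5*ln 0.5 + 2*0.25*ln 0.25) ≈ 1.0397
# a brand that only ever co-occurs with a single model would score 0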
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)  # note: float16 trades precision for memory
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
## Read the data with pandas (a friendly data-loading library)
Train_data = reduce_mem_usage(pd.read_csv('car_train_0110.csv', sep=' '))
TestA_data = reduce_mem_usage(pd.read_csv('car_testA_0110.csv', sep=' '))

#Train_data = Train_data[Train_data['price']>100]
#Train_data['price'] = np.log1p(Train_data['price'])
## Print dataset shapes
print('Train data shape:',Train_data.shape)
print('TestA data shape:',TestA_data.shape)


# Concatenate train and test
concat_data = pd.concat([Train_data,TestA_data])
concat_data['notRepairedDamage'] = concat_data['notRepairedDamage'].replace('-',0).astype('float16')
#concat_data = concat_data.fillna(concat_data.mode().iloc[0,:])
#concat_data.index = range(200000)
#concat_data = concat_data.groupby('bodyType').apply(smooth_cols,out_value=600,kind='g')
#concat_data.index = range(200000)
#concat_data['power'] = np.log(concat_data['power'])
print('concat_data shape:',concat_data.shape)
Memory usage of dataframe is 76.29 MB
Memory usage after optimization is: 20.74 MB
Decreased by 72.8%
Memory usage of dataframe is 14.88 MB
Memory usage after optimization is: 3.96 MB
Decreased by 73.4%
Train data shape: (250000, 40)
TestA data shape: (50000, 39)
concat_data shape: (300000, 40)
# Inspect the data
concat_data.info()
# Check missing values
concat_data.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 0 to 49999
Data columns (total 40 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SaleID             300000 non-null  int32  
 1   name               300000 non-null  int32  
 2   regDate            300000 non-null  int32  
 3   model              300000 non-null  float16
 4   brand              300000 non-null  int8   
 5   bodyType           269510 non-null  float16
 6   fuelType           273108 non-null  float16
 7   gearbox            283774 non-null  float16
 8   power              300000 non-null  int16  
 9   kilometer          300000 non-null  float16
 10  notRepairedDamage  241836 non-null  float16
 11  regionCode         300000 non-null  int16  
 12  seller             300000 non-null  int8   
 13  offerType          300000 non-null  int8   
 14  creatDate          300000 non-null  int32  
 15  price              250000 non-null  float64
 16  v_0                300000 non-null  float16
 17  v_1                300000 non-null  float16
 18  v_2                300000 non-null  float16
 19  v_3                300000 non-null  float16
 20  v_4                300000 non-null  float16
 21  v_5                300000 non-null  float16
 22  v_6                300000 non-null  float16
 23  v_7                300000 non-null  float16
 24  v_8                300000 non-null  float16
 25  v_9                300000 non-null  float16
 26  v_10               300000 non-null  float16
 27  v_11               300000 non-null  float16
 28  v_12               300000 non-null  float16
 29  v_13               300000 non-null  float16
 30  v_14               300000 non-null  float16
 31  v_15               300000 non-null  float16
 32  v_16               300000 non-null  float16
 33  v_17               300000 non-null  float16
 34  v_18               300000 non-null  float16
 35  v_19               300000 non-null  float16
 36  v_20               300000 non-null  float16
 37  v_21               300000 non-null  float16
 38  v_22               300000 non-null  float16
 39  v_23               300000 non-null  float16
dtypes: float16(30), float64(1), int16(2), int32(4), int8(3)
memory usage: 28.3 MB





SaleID                   0
name                     0
regDate                  0
model                    0
brand                    0
bodyType             30490
fuelType             26892
gearbox              16226
power                    0
kilometer                0
notRepairedDamage    58164
regionCode               0
seller                   0
offerType                0
creatDate                0
price                50000
v_0                      0
v_1                      0
v_2                      0
v_3                      0
v_4                      0
v_5                      0
v_6                      0
v_7                      0
v_8                      0
v_9                      0
v_10                     0
v_11                     0
v_12                     0
v_13                     0
v_14                     0
v_15                     0
v_16                     0
v_17                     0
v_18                     0
v_19                     0
v_20                     0
v_21                     0
v_22                     0
v_23                     0
dtype: int64
from sklearn import ensemble
from sklearn.preprocessing import LabelEncoder
def set_missing(df,estimate_list,miss_col):
    """df: the frame to process; estimate_list: columns used to estimate the missing values;
    miss_col: name of the column with missing values. Modifies df in place."""
    col_list = list(estimate_list) + [miss_col]  # copy, so repeated calls don't grow estimate_list
    process_df = df.loc[:,col_list]
    class_le = LabelEncoder()
    for i in col_list[:-1]:
        process_df.loc[:,i] = class_le.fit_transform(process_df.loc[:,i].values)
    # Split into rows where the feature is known vs. unknown
    known = process_df[process_df[miss_col].notnull()].values
    known[:, -1] = class_le.fit_transform(known[:, -1])
    unknown = process_df[process_df[miss_col].isnull()].values
    # X: predictor values
    X = known[:, :-1]
    # y: target labels
    y = known[:, -1]
    # Fit a RandomForestRegressor
    rfr = ensemble.RandomForestRegressor(random_state=1, n_estimators=200,max_depth=4,n_jobs=-1)
    rfr.fit(X,y)
    # Predict the unknown feature values with the fitted model
    predicted = rfr.predict(unknown[:, :-1]).round(0).astype(int)
    predicted = class_le.inverse_transform(predicted)
#     print(predicted)
    # Fill the original missing entries with the predictions
    df.loc[(df[miss_col].isnull()), miss_col] = predicted
    return df
features = ['bodyType','v_0', 'v_1', 'v_2', 'v_3',
       'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
       'v_13', 'v_14', 'v_15', 'v_16', 'v_17', 'v_18', 'v_19', 'v_20', 'v_21',
       'v_22', 'v_23','name', 'regDate', 'model', 'brand', 'fuelType',
        'gearbox', 'kilometer', 'notRepairedDamage', 'regionCode',
        'seller', 'offerType', 'creatDate', 'power', 'price']
estimate_list = ['v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 
                 'v_14', 'v_15', 'v_16', 'v_17', 'v_18', 'v_19', 'v_20', 'v_21', 'v_22', 'v_23','name', 'regDate', 'model', 
                 'brand', 'kilometer','regionCode', 'seller', 'offerType', 'creatDate', 'power', 'price']
miss_col ='bodyType'
set_missing(concat_data,estimate_list,miss_col)
miss_col ='fuelType'
set_missing(concat_data,estimate_list,miss_col)
miss_col ='gearbox'
set_missing(concat_data,estimate_list,miss_col)
miss_col ='notRepairedDamage'
set_missing(concat_data,estimate_list,miss_col)
[Output elided: DataFrame preview, 300000 rows × 40 columns]

# Inspect the data
concat_data.info()
# Check missing values
concat_data.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 0 to 49999
Data columns (total 40 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SaleID             300000 non-null  int32  
 1   name               300000 non-null  int32  
 2   regDate            300000 non-null  int32  
 3   model              300000 non-null  float16
 4   brand              300000 non-null  int8   
 5   bodyType           300000 non-null  float16
 6   fuelType           300000 non-null  float16
 7   gearbox            300000 non-null  float16
 8   power              300000 non-null  int16  
 9   kilometer          300000 non-null  float16
 10  notRepairedDamage  300000 non-null  float16
 11  regionCode         300000 non-null  int16  
 12  seller             300000 non-null  int8   
 13  offerType          300000 non-null  int8   
 14  creatDate          300000 non-null  int32  
 15  price              250000 non-null  float64
 16  v_0                300000 non-null  float16
 17  v_1                300000 non-null  float16
 18  v_2                300000 non-null  float16
 19  v_3                300000 non-null  float16
 20  v_4                300000 non-null  float16
 21  v_5                300000 non-null  float16
 22  v_6                300000 non-null  float16
 23  v_7                300000 non-null  float16
 24  v_8                300000 non-null  float16
 25  v_9                300000 non-null  float16
 26  v_10               300000 non-null  float16
 27  v_11               300000 non-null  float16
 28  v_12               300000 non-null  float16
 29  v_13               300000 non-null  float16
 30  v_14               300000 non-null  float16
 31  v_15               300000 non-null  float16
 32  v_16               300000 non-null  float16
 33  v_17               300000 non-null  float16
 34  v_18               300000 non-null  float16
 35  v_19               300000 non-null  float16
 36  v_20               300000 non-null  float16
 37  v_21               300000 non-null  float16
 38  v_22               300000 non-null  float16
 39  v_23               300000 non-null  float16
dtypes: float16(30), float64(1), int16(2), int32(4), int8(3)
memory usage: 28.3 MB





SaleID                   0
name                     0
regDate                  0
model                    0
brand                    0
bodyType                 0
fuelType                 0
gearbox                  0
power                    0
kilometer                0
notRepairedDamage        0
regionCode               0
seller                   0
offerType                0
creatDate                0
price                50000
v_0                      0
v_1                      0
v_2                      0
v_3                      0
v_4                      0
v_5                      0
v_6                      0
v_7                      0
v_8                      0
v_9                      0
v_10                     0
v_11                     0
v_12                     0
v_13                     0
v_14                     0
v_15                     0
v_16                     0
v_17                     0
v_18                     0
v_19                     0
v_20                     0
v_21                     0
v_22                     0
v_23                     0
dtype: int64
# Clip outliers
concat_data.loc[concat_data['power'] > 600, 'power'] = 600
concat_data.loc[concat_data['power'] < 1, 'power'] = 1
# 'name' has some duplicate values; add a simple frequency statistic
concat_data['name_count'] = concat_data.groupby(['name'])['SaleID'].transform('count')
# del concat_data['name']
# del concat_data['offerType']
# concat_data.drop(concat_data[concat_data['seller'] == 0].index, inplace=True)
# del concat_data['seller']
# Inspect the data
concat_data.info()
# Check missing values
concat_data.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 0 to 49999
Data columns (total 41 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SaleID             300000 non-null  int32  
 1   name               300000 non-null  int32  
 2   regDate            300000 non-null  int32  
 3   model              300000 non-null  float16
 4   brand              300000 non-null  int8   
 5   bodyType           300000 non-null  float16
 6   fuelType           300000 non-null  float16
 7   gearbox            300000 non-null  float16
 8   power              300000 non-null  int16  
 9   kilometer          300000 non-null  float16
 10  notRepairedDamage  300000 non-null  float16
 11  regionCode         300000 non-null  int16  
 12  seller             300000 non-null  int8   
 13  offerType          300000 non-null  int8   
 14  creatDate          300000 non-null  int32  
 15  price              250000 non-null  float64
 16  v_0                300000 non-null  float16
 17  v_1                300000 non-null  float16
 18  v_2                300000 non-null  float16
 19  v_3                300000 non-null  float16
 20  v_4                300000 non-null  float16
 21  v_5                300000 non-null  float16
 22  v_6                300000 non-null  float16
 23  v_7                300000 non-null  float16
 24  v_8                300000 non-null  float16
 25  v_9                300000 non-null  float16
 26  v_10               300000 non-null  float16
 27  v_11               300000 non-null  float16
 28  v_12               300000 non-null  float16
 29  v_13               300000 non-null  float16
 30  v_14               300000 non-null  float16
 31  v_15               300000 non-null  float16
 32  v_16               300000 non-null  float16
 33  v_17               300000 non-null  float16
 34  v_18               300000 non-null  float16
 35  v_19               300000 non-null  float16
 36  v_20               300000 non-null  float16
 37  v_21               300000 non-null  float16
 38  v_22               300000 non-null  float16
 39  v_23               300000 non-null  float16
 40  name_count         300000 non-null  int64  
dtypes: float16(30), float64(1), int16(2), int32(4), int64(1), int8(3)
memory usage: 30.6 MB





SaleID                   0
name                     0
regDate                  0
model                    0
brand                    0
bodyType                 0
fuelType                 0
gearbox                  0
power                    0
kilometer                0
notRepairedDamage        0
regionCode               0
seller                   0
offerType                0
creatDate                0
price                50000
v_0                      0
v_1                      0
v_2                      0
v_3                      0
v_4                      0
v_5                      0
v_6                      0
v_7                      0
v_8                      0
v_9                      0
v_10                     0
v_11                     0
v_12                     0
v_13                     0
v_14                     0
v_15                     0
v_16                     0
v_17                     0
v_18                     0
v_19                     0
v_20                     0
v_21                     0
v_22                     0
v_23                     0
name_count               0
dtype: int64
# Brute-force pairwise interactions of the anonymous features v_0..v_23:
# keep a product/sum/difference only when its absolute correlation with price exceeds 0.5
df2 = concat_data.copy()
for i in range(24):
    for j in range(i + 1, 24):
        df2['temp1'] = concat_data['v_'+str(i)] * concat_data['v_'+str(j)]
        df2['temp2'] = concat_data['v_'+str(i)] + concat_data['v_'+str(j)]
        df2['temp3'] = concat_data['v_'+str(i)] - concat_data['v_'+str(j)]
        if abs(concat_data['price'].corr(df2['temp1'])) > 0.5:
            concat_data[str(i)+'*'+str(j)] = df2['temp1']
        if abs(concat_data['price'].corr(df2['temp2'])) > 0.5:
            concat_data[str(i)+'+'+str(j)] = df2['temp2']
        if abs(concat_data['price'].corr(df2['temp3'])) > 0.5:
            concat_data[str(i)+'-'+str(j)] = df2['temp3']
concat_data.replace(to_replace='-', value=np.nan, inplace=True)
concat_data.fillna(concat_data.median(), inplace=True)
# for i in ['v_' +str(i) for i in range(23)]:
#     for j in ['v_' +str(i) for i in range(23)]:
#         concat_data[str(i)+'+'+str(j)] = concat_data[str(i)]+concat_data[str(j)]
for i in ['model','brand', 'bodyType', 'fuelType','gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode']:
    for j in ['v_' + str(k) for k in range(23)]:  # note: stops at v_22
        concat_data[i + '*' + j] = concat_data[i] * concat_data[j]
concat_data.shape
(300000, 327)
# Extract date features
date_cols = ['regDate', 'creatDate']
concat_data = date_tran(concat_data,date_cols)
100%|██████████| 2/2 [00:00<00:00,  2.35it/s]
data = concat_data.copy()

# Count encoding
count_list = ['regDate', 'creatDate', 'model', 'brand', 'regionCode','bodyType','fuelType','regDate_year', 'regDate_month', 'regDate_day',
       'regDate_dayofweek' , 'creatDate_month','creatDate_day', 'creatDate_dayofweek','kilometer']
       
data = count_coding(data,count_list)
# Feature construction
# Usage time: data['creatDate'] - data['regDate'] reflects how long the car was in use;
# price is generally inversely related to usage time.
# Some dates are malformed, hence errors='coerce'.
data['used_time1'] = (pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce') - 
                            pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')).dt.days
data['used_time2'] = (pd.Timestamp.now() - pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')).dt.days
data['used_time3'] = (pd.Timestamp.now() - pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce')).dt.days

# Binning
cut_cols = ['power']+['used_time1','used_time2','used_time3']
data = cut_group(data,cut_cols,50)
### Describe the categorical features with statistics of numeric ones; a few anonymous features most correlated with price are picked
cross_cat = ['model', 'brand','regDate_year']
cross_num = ['v_0','v_3', 'v_11', 'v_18','power']
data = cross_cat_num(data,cross_num,cross_cat)  # first-order crosses
100%|██████████| 3/3 [00:04<00:00,  1.56s/it]
## Select feature columns
numerical_cols = data.columns
#print(numerical_cols)

cat_fea = ['SaleID','offerType','seller']
feature_cols = [col for col in numerical_cols if col not in cat_fea]
feature_cols = [col for col in feature_cols if col not in ['price']]

## Build training and test samples from the feature and label columns
X_data = data.iloc[:len(Train_data),:][feature_cols]
Y_data = Train_data['price']
X_test  = data.iloc[len(Train_data):,:][feature_cols]
concat_data.shape
(300000, 335)
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold,KFold
from itertools import product
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=10, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
 
        :param n_splits: the number of splits used in mean encoding
 
        :param target_type: str, 'regression' or 'classification'
 
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """
 
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
 
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
 
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
 
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
 
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
 
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg([('mean', 'mean'), ('beta', 'size')])
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)
 
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
 
        return nf_train, nf_test, prior, col_avg_y
 
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
 
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
 
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
 
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
 
        return X_new
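Inside fit_transform, each category's encoding shrinks its out-of-fold mean toward the global prior: enc = beta*prior + (1-beta)*mean, with beta = 1/(1 + exp((n-k)/f)) from prior_weight_func. A numeric sketch with hypothetical prices and the default k=2, f=1:
beta = lambda n: 1 / (1 + np.exp((n - 2) / 1))  # default prior_weight_func
prior, cat_mean = 5000.0, 8000.0                # hypothetical global and category means
for n in [1, 2, 20]:
    print(n, round(beta(n) * prior + (1 - beta(n)) * cat_mean))
# n=1 -> 5807 (mostly prior), n=2 -> 6500 (halfway), n=20 -> 8000 (category mean dominates)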
class_list = ['model','brand','name','regionCode'] + date_cols
MeanEncodeFeature = class_list  # features to mean-encode
ME = MeanEncoder(MeanEncodeFeature, target_type='regression')  # instantiate the mean encoder
X_data = ME.fit_transform(X_data, Y_data)  # fit on the training X and y
#x_train_fav = ME.fit_transform(x_train, y_train_fav)  # fit on the training X and y
X_test = ME.transform(X_test)  # encode the test set
X_data['price'] = Train_data['price']
from sklearn.model_selection import KFold

### Target encoding. Regression offers more choices than classification: besides mean encoding, you can also encode with the standard deviation, median, etc.
enc_cols = []
stats_default_dict = {
    'max': X_data['price'].max(),
    'min': X_data['price'].min(),
    'median': X_data['price'].median(),
    'mean': X_data['price'].mean(),
    'sum': X_data['price'].sum(),
    'std': X_data['price'].std(),
    'skew': X_data['price'].skew(),
    'kurt': X_data['price'].kurt(),
    'mad': X_data['price'].mad()
}
### For now, use these three statistics
enc_stats = ['max','min','mean']
skf = KFold(n_splits=10, shuffle=True, random_state=42)
for f in tqdm(['regionCode','brand','regDate_year','creatDate_year','kilometer','model']):
    enc_dict = {}
    for stat in enc_stats:
        enc_dict['{}_target_{}'.format(f, stat)] = stat
        X_data['{}_target_{}'.format(f, stat)] = 0
        X_test['{}_target_{}'.format(f, stat)] = 0
        enc_cols.append('{}_target_{}'.format(f, stat))
    for i, (trn_idx, val_idx) in enumerate(skf.split(X_data, Y_data)):
        trn_x, val_x = X_data.iloc[trn_idx].reset_index(drop=True), X_data.iloc[val_idx].reset_index(drop=True)
        enc_df = trn_x.groupby(f, as_index=False)['price'].agg(enc_dict)
        val_x = val_x[[f]].merge(enc_df, on=f, how='left')
        test_x = X_test[[f]].merge(enc_df, on=f, how='left')
        for stat in enc_stats:
            val_x['{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].fillna(stats_default_dict[stat])
            test_x['{}_target_{}'.format(f, stat)] = test_x['{}_target_{}'.format(f, stat)].fillna(stats_default_dict[stat])
            X_data.loc[val_idx, '{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].values 
            X_test['{}_target_{}'.format(f, stat)] += test_x['{}_target_{}'.format(f, stat)].values / skf.n_splits
100%|██████████| 6/6 [00:14<00:00,  2.40s/it]
# Inspect the data
concat_data.info()
# Check missing values
concat_data.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 0 to 49999
Columns: 335 entries, SaleID to creatDate_dayofweek
dtypes: datetime64[ns](2), float16(270), float32(46), float64(1), int16(2), int32(2), int64(9), int8(3)
memory usage: 241.2 MB





SaleID                 0
name                   0
regDate                0
model                  0
brand                  0
                      ..
regDate_dayofweek      0
creatDate_year         0
creatDate_month        0
creatDate_day          0
creatDate_dayofweek    0
Length: 335, dtype: int64
X_data.shape
(250000, 423)
X_test.shape
(50000, 422)
features1 = ['price','power', 'v_0', 'v_3', 'v_18', '1-16', '3-7', '3*8', '3*10', '3*11', '3*12', '3*15', '3+23', '3-23', '7-18', '8*18', '9-18', '11+14', '11+15', '11*18', '15*18', 'power*v_3', 'power*v_8', 'power*v_10', 'power*v_11', 'power*v_12', 'notRepairedDamage*v_3', 'notRepairedDamage*v_18', 'brand_v_11_min', 'brand_power_min', 'regDate_year_v_3_median', 'regDate_year_v_11_median', 'model_pred', 'brand_target_min', 'kilometer_target_min']
features2 = ['power', 'v_0', 'v_3', 'v_18', '1-16', '3-7', '3*8', '3*10', '3*11', '3*12', '3*15', '3+23', '3-23', '7-18', '8*18', '9-18', '11+14', '11+15', '11*18', '15*18', 'power*v_3', 'power*v_8', 'power*v_10', 'power*v_11', 'power*v_12', 'notRepairedDamage*v_3', 'notRepairedDamage*v_18', 'brand_v_11_min', 'brand_power_min', 'regDate_year_v_3_median', 'regDate_year_v_11_median', 'model_pred', 'brand_target_min', 'kilometer_target_min']
from sklearn.preprocessing import MinMaxScaler
# Feature normalization for the NN export: fit the scaler on the training rows only
# and reuse it for the test rows. Keep the scaled copies in new variables so the
# original DataFrames stay available for the correlation filtering below.
scaler = MinMaxScaler()
scaler.fit(X_data[features2].values)
X_data_nn = scaler.transform(X_data[features2].values)
X_test_nn = scaler.transform(X_test[features2].values)
output_path = 'user_data/'
nn_data = pd.DataFrame(X_data_nn, columns=features2)
nn_data['price'] = np.array(Train_data['price'])
nn_data['SaleID'] = np.array(Train_data['SaleID'])
print(nn_data.shape)
nn_data.to_csv(output_path + 'train_nn10.csv', index=0, sep=' ')
(250000, 36)
output_path = 'user_data/'
nn_data = pd.DataFrame(X_test_nn, columns=features2)
nn_data['SaleID'] = np.array(TestA_data['SaleID'])
print(nn_data.shape)
nn_data.to_csv(output_path + 'test_nn10.csv', index=0, sep=' ')
(50000, 35)


df = X_data.copy()
corr = df.corr(method='spearman')
feature_group = list(itertools.combinations(corr.columns, 2))
# print(feature_group)

# Drop highly correlated variables; once tuned, move the removal into the main pipeline
def filter_corr(corr, cutoff=0.7):
    # of each highly correlated pair, drop the one with the larger average correlation
    # to everything else, unless it appears in the raw-feature whitelist 'features'
    cols = []
    for i,j in feature_group:
        if corr.loc[i, j] > cutoff:
#             print(i,j,corr.loc[i, j])
            i_avg = corr[i][corr[i] != 1].mean()
            j_avg = corr[j][corr[j] != 1].mean()
            if i_avg >= j_avg:
                if i not in features:
                    cols.append(i)
                else:
                    cols.append(j)
            else:
                if j not in features:
                    cols.append(j)
                else:
                    cols.append(i)
    return set(cols)

drop_cols = filter_corr(corr, cutoff=0.95)
X_data = X_data.drop(drop_cols,axis=1)
x_test = X_test.drop(drop_cols,axis=1)
features1 = X_data.columns
fea1 = list(features1)
len(fea1)
284
print(fea1)
['name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode', 'creatDate', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'v_15', 'v_16', 'v_17', 'v_18', 'v_19', 'v_21', 'v_23', 'name_count', '1-16', '3-7', '3*8', '3*10', '3*11', '3*12', '3*15', '3+23', '3-23', '7-18', '8*18', '9-18', '11+14', '11+15', '11*18', '15*18', 'model*v_1', 'model*v_2', 'model*v_3', 'model*v_4', 'model*v_6', 'model*v_8', 'model*v_9', 'model*v_10', 'model*v_11', 'model*v_12', 'model*v_13', 'model*v_14', 'model*v_15', 'model*v_17', 'model*v_18', 'model*v_19', 'model*v_20', 'model*v_22', 'brand*v_1', 'brand*v_2', 'brand*v_3', 'brand*v_4', 'brand*v_6', 'brand*v_8', 'brand*v_9', 'brand*v_10', 'brand*v_11', 'brand*v_12', 'brand*v_13', 'brand*v_14', 'brand*v_15', 'brand*v_17', 'brand*v_18', 'brand*v_19', 'brand*v_20', 'brand*v_22', 'bodyType*v_1', 'bodyType*v_4', 'bodyType*v_6', 'bodyType*v_8', 'bodyType*v_9', 'bodyType*v_10', 'bodyType*v_11', 'bodyType*v_12', 'bodyType*v_13', 'bodyType*v_14', 'bodyType*v_16', 'bodyType*v_20', 'bodyType*v_22', 'fuelType*v_1', 'fuelType*v_2', 'fuelType*v_3', 'fuelType*v_6', 'fuelType*v_8', 'fuelType*v_9', 'fuelType*v_10', 'fuelType*v_12', 'fuelType*v_13', 'fuelType*v_14', 'fuelType*v_15', 'fuelType*v_17', 'fuelType*v_18', 'fuelType*v_19', 'fuelType*v_20', 'fuelType*v_22', 'gearbox*v_0', 'gearbox*v_1', 'gearbox*v_2', 'gearbox*v_3', 'gearbox*v_4', 'gearbox*v_6', 'gearbox*v_8', 'gearbox*v_9', 'gearbox*v_10', 'gearbox*v_11', 'gearbox*v_12', 'gearbox*v_13', 'gearbox*v_14', 'gearbox*v_15', 'gearbox*v_16', 'gearbox*v_17', 'gearbox*v_18', 'gearbox*v_19', 'gearbox*v_20', 'gearbox*v_22', 'power*v_1', 'power*v_2', 'power*v_3', 'power*v_4', 'power*v_6', 'power*v_8', 'power*v_9', 'power*v_10', 'power*v_11', 'power*v_12', 'power*v_14', 'power*v_17', 'power*v_19', 'power*v_20', 'power*v_22', 'kilometer*v_0', 'kilometer*v_1', 'kilometer*v_6', 'kilometer*v_8', 'kilometer*v_9', 'kilometer*v_10', 'kilometer*v_11', 'kilometer*v_12', 'kilometer*v_13', 'kilometer*v_16', 'kilometer*v_20', 'kilometer*v_22', 'notRepairedDamage*v_0', 'notRepairedDamage*v_2', 'notRepairedDamage*v_3', 'notRepairedDamage*v_4', 'notRepairedDamage*v_6', 'notRepairedDamage*v_9', 'notRepairedDamage*v_10', 'notRepairedDamage*v_11', 'notRepairedDamage*v_12', 'notRepairedDamage*v_13', 'notRepairedDamage*v_14', 'notRepairedDamage*v_15', 'notRepairedDamage*v_16', 'notRepairedDamage*v_17', 'notRepairedDamage*v_18', 'notRepairedDamage*v_19', 'notRepairedDamage*v_20', 'notRepairedDamage*v_22', 'regionCode*v_1', 'regionCode*v_2', 'regionCode*v_3', 'regionCode*v_4', 'regionCode*v_6', 'regionCode*v_8', 'regionCode*v_9', 'regionCode*v_10', 'regionCode*v_11', 'regionCode*v_12', 'regionCode*v_13', 'regionCode*v_14', 'regionCode*v_17', 'regionCode*v_18', 'regionCode*v_19', 'regionCode*v_20', 'regionCode*v_22', 'regDate_year', 'regDate_month', 'regDate_day', 'regDate_dayofweek', 'creatDate_month', 'creatDate_day', 'creatDate_dayofweek', 'regDate_count', 'creatDate_count', 'model_count', 'brand_count', 'regionCode_count', 'bodyType_count', 'fuelType_count', 'regDate_year_count', 'regDate_month_count', 'regDate_day_count', 'regDate_dayofweek_count', 'creatDate_month_count', 'creatDate_day_count', 'creatDate_dayofweek_count', 'used_time3', 'used_time1_bin', 'used_time3_bin', 'model_v_0_max', 'model_v_0_min', 'model_v_0_median', 'model_v_3_max', 'model_v_3_min', 'model_v_3_median', 'model_v_11_max', 'model_v_11_min', 
'model_v_11_median', 'model_v_18_max', 'model_v_18_min', 'model_power_max', 'model_power_min', 'model_power_median', 'brand_v_0_max', 'brand_v_0_min', 'brand_v_0_median', 'brand_v_3_max', 'brand_v_3_min', 'brand_v_11_max', 'brand_v_11_min', 'brand_v_11_median', 'brand_v_18_max', 'brand_v_18_min', 'brand_v_18_median', 'brand_power_max', 'brand_power_min', 'brand_power_median', 'regDate_year_v_0_max', 'regDate_year_v_0_min', 'regDate_year_v_0_median', 'regDate_year_v_3_max', 'regDate_year_v_3_min', 'regDate_year_v_3_median', 'regDate_year_v_11_max', 'regDate_year_v_11_min', 'regDate_year_v_11_median', 'regDate_year_v_18_max', 'regDate_year_v_18_min', 'regDate_year_power_max', 'regDate_year_power_min', 'regDate_year_power_median', 'model_pred', 'name_pred', 'regionCode_pred', 'creatDate_pred', 'price', 'regionCode_target_max', 'regionCode_target_min', 'regionCode_target_mean', 'brand_target_max', 'brand_target_min', 'brand_target_mean', 'regDate_year_target_max', 'regDate_year_target_min', 'creatDate_year_target_max', 'creatDate_year_target_min', 'creatDate_year_target_mean', 'kilometer_target_max', 'kilometer_target_min', 'kilometer_target_mean', 'model_target_max', 'model_target_min']
corrs = X_data.corr()
df3 = corrs['price']
df3  # inspect correlations with price
name                    -0.008057
model                    0.146305
brand                   -0.010830
bodyType                 0.027771
fuelType                 0.197623
                           ...   
kilometer_target_max    -0.128785
kilometer_target_min          NaN
kilometer_target_mean    0.448072
model_target_max         0.196083
model_target_min         0.096333
Name: price, Length: 282, dtype: float64
print(df3.iloc[2])
-0.010829998028367609
features2 = df3.index
fea2 = list(features2)
# Drop features whose absolute correlation with price is below 0.5
# (NaN correlations survive, since abs(NaN) < 0.5 is False)
drops = [f for f, c in zip(fea2, df3) if abs(c) < 0.5]
print(drops)
['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'kilometer', 'notRepairedDamage', 'regionCode', 'v_1', 'v_2', 'v_4', 'v_5', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'v_15', 'v_16', 'v_17', 'v_19', 'v_21', 'v_23', 'name_count', 'model*v_1', 'model*v_2', 'model*v_3', 'model*v_4', 'model*v_6', 'model*v_8', 'model*v_9', 'model*v_10', 'model*v_11', 'model*v_12', 'model*v_13', 'model*v_14', 'model*v_15', 'model*v_17', 'model*v_18', 'model*v_19', 'model*v_20', 'model*v_22', 'brand*v_1', 'brand*v_2', 'brand*v_3', 'brand*v_4', 'brand*v_6', 'brand*v_8', 'brand*v_9', 'brand*v_10', 'brand*v_11', 'brand*v_12', 'brand*v_13', 'brand*v_14', 'brand*v_15', 'brand*v_17', 'brand*v_18', 'brand*v_19', 'brand*v_20', 'brand*v_22', 'bodyType*v_1', 'bodyType*v_4', 'bodyType*v_6', 'bodyType*v_8', 'bodyType*v_9', 'bodyType*v_10', 'bodyType*v_11', 'bodyType*v_12', 'bodyType*v_13', 'bodyType*v_14', 'bodyType*v_16', 'bodyType*v_20', 'bodyType*v_22', 'fuelType*v_1', 'fuelType*v_2', 'fuelType*v_3', 'fuelType*v_6', 'fuelType*v_8', 'fuelType*v_9', 'fuelType*v_10', 'fuelType*v_12', 'fuelType*v_13', 'fuelType*v_14', 'fuelType*v_15', 'fuelType*v_17', 'fuelType*v_18', 'fuelType*v_19', 'fuelType*v_20', 'fuelType*v_22', 'gearbox*v_0', 'gearbox*v_1', 'gearbox*v_2', 'gearbox*v_3', 'gearbox*v_4', 'gearbox*v_6', 'gearbox*v_8', 'gearbox*v_9', 'gearbox*v_10', 'gearbox*v_11', 'gearbox*v_12', 'gearbox*v_13', 'gearbox*v_14', 'gearbox*v_15', 'gearbox*v_16', 'gearbox*v_17', 'gearbox*v_18', 'gearbox*v_19', 'gearbox*v_20', 'gearbox*v_22', 'power*v_1', 'power*v_2', 'power*v_4', 'power*v_6', 'power*v_9', 'power*v_14', 'power*v_17', 'power*v_19', 'power*v_20', 'power*v_22', 'kilometer*v_0', 'kilometer*v_1', 'kilometer*v_6', 'kilometer*v_8', 'kilometer*v_9', 'kilometer*v_10', 'kilometer*v_11', 'kilometer*v_12', 'kilometer*v_13', 'kilometer*v_16', 'kilometer*v_20', 'kilometer*v_22', 'notRepairedDamage*v_0', 'notRepairedDamage*v_2', 'notRepairedDamage*v_4', 'notRepairedDamage*v_6', 'notRepairedDamage*v_9', 'notRepairedDamage*v_10', 'notRepairedDamage*v_11', 'notRepairedDamage*v_12', 'notRepairedDamage*v_13', 'notRepairedDamage*v_14', 'notRepairedDamage*v_15', 'notRepairedDamage*v_16', 'notRepairedDamage*v_17', 'notRepairedDamage*v_19', 'notRepairedDamage*v_20', 'notRepairedDamage*v_22', 'regionCode*v_1', 'regionCode*v_2', 'regionCode*v_3', 'regionCode*v_4', 'regionCode*v_6', 'regionCode*v_8', 'regionCode*v_9', 'regionCode*v_10', 'regionCode*v_11', 'regionCode*v_12', 'regionCode*v_13', 'regionCode*v_14', 'regionCode*v_17', 'regionCode*v_18', 'regionCode*v_19', 'regionCode*v_20', 'regionCode*v_22', 'regDate_year', 'regDate_month', 'regDate_day', 'regDate_dayofweek', 'creatDate_month', 'creatDate_day', 'creatDate_dayofweek', 'regDate_count', 'creatDate_count', 'model_count', 'brand_count', 'regionCode_count', 'bodyType_count', 'fuelType_count', 'regDate_year_count', 'regDate_month_count', 'regDate_day_count', 'regDate_dayofweek_count', 'creatDate_month_count', 'creatDate_day_count', 'creatDate_dayofweek_count', 'used_time3', 'used_time1_bin', 'used_time3_bin', 'model_v_0_max', 'model_v_0_min', 'model_v_0_median', 'model_v_3_max', 'model_v_3_min', 'model_v_3_median', 'model_v_11_max', 'model_v_11_min', 'model_v_11_median', 'model_v_18_max', 'model_v_18_min', 'model_power_max', 'model_power_min', 'model_power_median', 'brand_v_0_max', 'brand_v_0_min', 'brand_v_0_median', 'brand_v_3_max', 'brand_v_3_min', 'brand_v_11_max', 'brand_v_11_median', 'brand_v_18_max', 'brand_v_18_min', 'brand_v_18_median', 'brand_power_max', 
'brand_power_median', 'regDate_year_v_0_max', 'regDate_year_v_0_min', 'regDate_year_v_0_median', 'regDate_year_v_3_max', 'regDate_year_v_3_min', 'regDate_year_v_11_max', 'regDate_year_v_11_min', 'regDate_year_v_18_max', 'regDate_year_v_18_min', 'regDate_year_power_max', 'regDate_year_power_min', 'regDate_year_power_median', 'name_pred', 'regionCode_pred', 'creatDate_pred', 'regionCode_target_max', 'regionCode_target_min', 'regionCode_target_mean', 'brand_target_max', 'brand_target_mean', 'regDate_year_target_max', 'regDate_year_target_min', 'creatDate_year_target_max', 'creatDate_year_target_min', 'creatDate_year_target_mean', 'kilometer_target_max', 'kilometer_target_mean', 'model_target_max', 'model_target_min']
X_data = X_data.drop(drops,axis=1)
features3 = X_data.columns
fea3 = list(features3)
print(fea3)
['regDate', 'power', 'creatDate', 'v_0', 'v_3', 'v_18', '1-16', '3-7', '3*8', '3*10', '3*11', '3*12', '3*15', '3+23', '3-23', '7-18', '8*18', '9-18', '11+14', '11+15', '11*18', '15*18', 'power*v_3', 'power*v_8', 'power*v_10', 'power*v_11', 'power*v_12', 'notRepairedDamage*v_3', 'notRepairedDamage*v_18', 'brand_v_11_min', 'brand_power_min', 'regDate_year_v_3_median', 'regDate_year_v_11_median', 'model_pred', 'price', 'brand_target_min', 'kilometer_target_min']
x_test = x_test.drop(drops,axis=1)
X_data = X_data.drop(['creatDate', 'regDate'], axis=1)
x_test = x_test.drop(['creatDate', 'regDate'], axis=1)
features1 = X_data.columns
fea1 = list(features1)
features2 = x_test.columns
fea2 = list(features2)
print(fea2)
['power', 'v_0', 'v_3', 'v_18', '1-16', '3-7', '3*8', '3*10', '3*11', '3*12', '3*15', '3+23', '3-23', '7-18', '8*18', '9-18', '11+14', '11+15', '11*18', '15*18', 'power*v_3', 'power*v_8', 'power*v_10', 'power*v_11', 'power*v_12', 'notRepairedDamage*v_3', 'notRepairedDamage*v_18', 'brand_v_11_min', 'brand_power_min', 'regDate_year_v_3_median', 'regDate_year_v_11_median', 'model_pred', 'brand_target_min', 'kilometer_target_min']
len(fea2)
34
from sklearn.preprocessing import MinMaxScaler
# Feature normalization: fit on the training rows only and reuse the scaler for the
# test rows; scale the shared feature set fea2 (fea1 minus 'price')
scaler = MinMaxScaler()
scaler.fit(X_data[fea2].values)
X_data = scaler.transform(X_data[fea2].values)
x_test = scaler.transform(x_test[fea2].values)
output_path = 'user_data/'
nn_data = pd.DataFrame(X_data, columns=fea2)
nn_data['price'] = np.array(Train_data['price'])
nn_data['SaleID'] = np.array(Train_data['SaleID'])
print(nn_data.shape)
nn_data.to_csv(output_path + 'train_nn10.csv', index=0, sep=' ')
(250000, 36)
output_path = 'user_data/'
nn_data = pd.DataFrame(x_test, columns=fea2)
nn_data['SaleID'] = np.array(TestA_data['SaleID'])
print(nn_data.shape)
nn_data.to_csv(output_path + 'test_nn10.csv', index=0, sep=' ')
(50000, 35)
# tree_data_path = 'user_data/'
# Train_NN_data = pd.read_csv(tree_data_path + 'train_nn10.csv', sep=' ')
# Test_NN_data = pd.read_csv(tree_data_path + 'test_nn10.csv', sep=' ')
# numerical_cols = Train_NN_data.columns
# feature_cols = [col for col in numerical_cols if col not in ['price','SaleID']]
# ## Build training and test samples from the feature and label columns
# X_data = Train_NN_data[feature_cols]
# X_test = Test_NN_data[feature_cols]

# # x = np.array(X_data)
# # y = np.array(Train_NN_data['price'])
# # x_ = np.array(X_test)

# x = X_data
# y = Train_NN_data['price']
# x_ = X_test


# # # Split the dataset
# x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2)

# output_path = 'user_data/'
# x_train.to_csv(output_path + 'x_train.csv', index=0, sep=' ')
# x_test.to_csv(output_path + 'x_test.csv', index=0, sep=' ')
# y_train.to_csv(output_path + 'y_train.csv', index=0, sep=' ')
# y_test.to_csv(output_path + 'y_test.csv', index=0, sep=' ')
# x_.to_csv(output_path + 'x_.csv', index=0, sep=' ')