task4 建模与调参

最新推荐文章于 2023-02-27 11:23:09 发布

learner-xz

最新推荐文章于 2023-02-27 11:23:09 发布

阅读量228

点赞数

分类专栏： datawhale之二手车交易价格预测

本文链接：https://blog.csdn.net/weixin_44104450/article/details/105254984

版权

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def _smallest_fitting_dtype(c_min, c_max, candidates, limits_of):
    """Return the first dtype in *candidates* whose range holds [c_min, c_max].

    ``limits_of`` is ``np.iinfo`` or ``np.finfo``.  Returns ``None`` when no
    candidate fits.  NaN bounds (empty or all-missing column) also yield
    ``None`` because every comparison against NaN is False.
    """
    for candidate in candidates:
        limits = limits_of(candidate)
        # Inclusive bounds: a value equal to the dtype limit is representable.
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """Downcast the columns of *df* in place to reduce its memory footprint.

    * Signed / unsigned integer columns are cast to the narrowest integer
      dtype of the same signedness that holds the column's min and max.
    * Float columns are cast to float16 or float32 when the value range
      fits, otherwise to float64.  Only the range is checked, so float16 /
      float32 may round the stored values.
    * ``object`` columns are converted to ``category``.
    * Every other dtype (bool, datetime, categorical, pandas extension
      dtypes, ...) is left untouched.

    Memory usage before and after is printed.

    Args:
        df: pandas DataFrame; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # numpy dtype kind -> (candidates ordered narrowest first, limits lookup)
    candidates_by_kind = {
        'i': ((np.int8, np.int16, np.int32, np.int64), np.iinfo),
        'u': ((np.uint8, np.uint16, np.uint32, np.uint64), np.iinfo),
        'f': ((np.float16, np.float32), np.finfo),
    }

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Dispatch on the real numpy kind instead of the dtype's string
        # prefix, so bool / unsigned columns are not mistaken for floats and
        # non-numeric or extension dtypes are skipped rather than compared
        # against numeric limits.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        candidates, limits_of = candidates_by_kind[col_type.kind]
        target = _smallest_fitting_dtype(
            df[col].min(), df[col].max(), candidates, limits_of
        )
        if target is None and col_type.kind == 'f':
            # Nothing narrower fits (or the column has no finite bounds).
            target = np.float64
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))
    return df

# Load the tree-model feature table, then downcast its columns to save memory.
sample_feature = pd.read_csv('G:/used_car/data_for_tree.csv')
sample_feature = reduce_mem_usage(sample_feature)

Memory usage of dataframe is 59.22 MB
Memory usage after optimization is: 15.75 MB
Decreased by 73.4%

# Feature columns: every column of sample_feature except 'price', 'brand'
# and 'model'.  (The original exclusion list named 'brand' twice; the
# duplicate was redundant and is collapsed here.)
continuous_feature_names = [
    x for x in sample_feature.columns if x not in {'price', 'brand', 'model'}
]

continuous_feature_names

['SaleID',
 'bodyType',
 'fuelType',
 'gearbox',
 'kilometer',
 'name',
 'notRepairedDamage',
 'offerType',
 'power',
 'seller',
 'train',
 'v_0',
 'v_1',
 'v_10',
 'v_11',
 'v_12',
 'v_13',
 'v_14',
 'v_2',
 'v_3',
 'v_4',
 'v_5',
 'v_6',
 'v_7',
 'v_8',
 'v_9',
 'used_time',
 'city',
 'brand_amount',
 'brand_price_max',
 'brand_price_median',
 'brand_price_min',
 'brand_price_sum',
 'brand_price_std',
 'brand_price_average',
 'power_bin']

# Notebook cell: preview the first rows of the loaded feature table.
sample_feature.head()

	SaleID	bodyType	brand	gearbox	kilometer	model	name	notRepairedDamage	...	used_time	city	brand_amount	brand_price_max	brand_price_median	brand_price_min	brand_price_sum	brand_price_std	brand_price_average	power_bin
0	0	1.0	6	0.0	12.5	30.0	736	0.0	...	4384.0	1.0	10192.0	35990.0	1800.0	13.0	36457520.0	4564.0	3576.0	5.0
1	1	2.0	1	0.0	15.0	40.0	2262	-	...	4756.0	4.0	13656.0	84000.0	6400.0	15.0	124044600.0	8992.0	9080.0	NaN
2	2	1.0	15	0.0	12.5	115.0	14874	0.0	...	4384.0	2.0	1458.0	45000.0	8496.0	100.0	14373814.0	5424.0	9848.0	16.0
3	3	0.0	10	1.0	15.0	109.0	71865	0.0	...	7124.0	NaN	13992.0	92900.0	5200.0	15.0	113034208.0	8248.0	8076.0	19.0
4	4	1.0	5	0.0	5.0	110.0	111080	0.0	...	1531.0	6.0	4664.0	31500.0	2300.0	20.0	15414322.0	3344.0	3306.0	6.0

5 rows × 39 columns

# Drop rows with missing values, map the '-' placeholder to 0, and renumber
# the remaining rows from zero.
sample_feature = sample_feature.dropna()
sample_feature = sample_feature.replace('-', 0)
sample_feature = sample_feature.reset_index(drop=True)
# With the placeholder gone, store the damage flag as float32.
sample_feature['notRepairedDamage'] = sample_feature['notRepairedDamage'].astype(np.float32)

# Modelling frame: the selected feature columns plus the 'price' column,
# then split into the feature matrix and the regression target.
train = sample_feature[[*continuous_feature_names, 'price']]
train_X = train[continuous_feature_names]
train_y = train['price']

from sklearn.linear_model import LinearRegression

# The original call was LinearRegression(normalize=True) ("with
# normalization").  The `normalize` keyword was deprecated in scikit-learn
# 1.0 and removed in 1.2, where passing it raises TypeError, so it is
# dropped here.  For plain least squares the estimator reported coef_ on the
# original feature scale even with normalize=True, so the fitted model is
# mathematically the same.
# NOTE(review): if explicit feature scaling is still wanted, scale train_X
# (e.g. with sklearn.preprocessing.StandardScaler) before fitting.
model = LinearRegression()

model = model

最低0.47元/天解锁文章

learner-xz

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
task4 建模与调参

import pandas as pd; import numpy as np; import warnings; warnings.filterwarnings('ignore'); def reduce_mem_usage(df): """ iterate through all the columns of a dataframe and modify the data type ...
复制链接

扫一扫