import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of a DataFrame in place to reduce memory usage.

    Each column is handled according to its dtype:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned NumPy integer columns are cast to the narrowest
      signed / unsigned integer type whose range contains the column's
      min and max (bounds inclusive).
    * NumPy float columns are cast to the narrowest float type whose range
      contains the column's min and max, falling back to ``float64``.
    * Every other dtype (bool, datetime, categorical, extension dtypes, ...)
      is left untouched.

    Memory usage before and after, and the relative saving, are printed.

    Args:
        df: DataFrame to optimise. It is modified in place.
        allow_float16: When True (the default), float columns whose range
            fits are stored as ``float16``. That type keeps only about three
            significant decimal digits, so values are rounded; pass False to
            use ``float32`` as the narrowest float type.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast; everything
        # else (bool, datetime, categorical, nullable extension dtypes) has
        # no meaningful numeric range to test and is kept as it is.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # An empty column yields NaN here, which fails every
                # comparison and therefore leaves the column unchanged.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out-of-range, infinite or all-NaN columns end up here.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a zero-byte frame reports 0.0% instead of nan.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))
    return df
# Load the prepared feature table, then shrink its column dtypes.
sample_feature = pd.read_csv('G:/used_car/data_for_tree.csv')
sample_feature = reduce_mem_usage(sample_feature)
Memory usage of dataframe is 59.22 MB
Memory usage after optimization is: 15.75 MB
Decreased by 73.4%
# Use every column as a model input except the target ('price') and the
# 'brand' / 'model' columns. Column order follows sample_feature.columns.
continuous_feature_names = [x for x in sample_feature.columns if x not in ('price', 'brand', 'model')]
continuous_feature_names
['SaleID',
'bodyType',
'fuelType',
'gearbox',
'kilometer',
'name',
'notRepairedDamage',
'offerType',
'power',
'seller',
'train',
'v_0',
'v_1',
'v_10',
'v_11',
'v_12',
'v_13',
'v_14',
'v_2',
'v_3',
'v_4',
'v_5',
'v_6',
'v_7',
'v_8',
'v_9',
'used_time',
'city',
'brand_amount',
'brand_price_max',
'brand_price_median',
'brand_price_min',
'brand_price_sum',
'brand_price_std',
'brand_price_average',
'power_bin']
# Display the first rows of the loaded table (notebook cell output follows).
sample_feature.head()
SaleID | bodyType | brand | fuelType | gearbox | kilometer | model | name | notRepairedDamage | offerType | ... | used_time | city | brand_amount | brand_price_max | brand_price_median | brand_price_min | brand_price_sum | brand_price_std | brand_price_average | power_bin | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1.0 | 6 | 0.0 | 0.0 | 12.5 | 30.0 | 736 | 0.0 | 0 | ... | 4384.0 | 1.0 | 10192.0 | 35990.0 | 1800.0 | 13.0 | 36457520.0 | 4564.0 | 3576.0 | 5.0 |
1 | 1 | 2.0 | 1 | 0.0 | 0.0 | 15.0 | 40.0 | 2262 | - | 0 | ... | 4756.0 | 4.0 | 13656.0 | 84000.0 | 6400.0 | 15.0 | 124044600.0 | 8992.0 | 9080.0 | NaN |
2 | 2 | 1.0 | 15 | 0.0 | 0.0 | 12.5 | 115.0 | 14874 | 0.0 | 0 | ... | 4384.0 | 2.0 | 1458.0 | 45000.0 | 8496.0 | 100.0 | 14373814.0 | 5424.0 | 9848.0 | 16.0 |
3 | 3 | 0.0 | 10 | 0.0 | 1.0 | 15.0 | 109.0 | 71865 | 0.0 | 0 | ... | 7124.0 | NaN | 13992.0 | 92900.0 | 5200.0 | 15.0 | 113034208.0 | 8248.0 | 8076.0 | 19.0 |
4 | 4 | 1.0 | 5 | 0.0 | 0.0 | 5.0 | 110.0 | 111080 | 0.0 | 0 | ... | 1531.0 | 6.0 | 4664.0 | 31500.0 | 2300.0 | 20.0 | 15414322.0 | 3344.0 | 3306.0 | 6.0 |
5 rows × 39 columns
# Clean the table step by step: drop rows that contain any missing value,
# turn the '-' placeholder into 0, and renumber the rows from zero.
sample_feature = sample_feature.dropna()
sample_feature = sample_feature.replace('-', 0)
sample_feature = sample_feature.reset_index(drop=True)

# With the '-' placeholder gone, notRepairedDamage can be stored as float32.
damage_flag = sample_feature['notRepairedDamage']
sample_feature['notRepairedDamage'] = damage_flag.astype(np.float32)

# Training frame = the selected feature columns plus the 'price' target,
# then split into the feature matrix and the target column.
train = sample_feature[continuous_feature_names + ['price']]
train_X, train_y = train[continuous_feature_names], train['price']
from sklearn.linear_model import LinearRegression

# Linear regression with feature normalization enabled.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2, so this call needs an older release — confirm the
# installed version before upgrading.
model = LinearRegression(normalize=True)
# NOTE(review): a no-op `model = model` followed here and was dropped;
# presumably `model.fit(train_X, train_y)` was intended — confirm.