import pandas as pd
import numpy as np
背景
一个汽车销售的数据集。这次的简单任务是:通过制造商品牌、颜色、里程表读数(KM)和车门数量来估计汽车的价格。
数据查看和缺失值的清理
# Load the car-sales dataset.
# NOTE(review): user-specific absolute Windows path — adjust per machine.
car_sales_missing = pd.read_csv(r"\Users\Administrator\Desktop\data\car-sales-extended-missing-data.csv")
# Peek at the first five rows
car_sales_missing.head()
Make | Colour | Odometer (KM) | Doors | Price | |
---|---|---|---|---|---|
0 | Honda | White | 35431.0 | 4.0 | 15323.0 |
1 | BMW | Blue | 192714.0 | 5.0 | 19943.0 |
2 | Honda | White | 84714.0 | 4.0 | 28343.0 |
3 | Toyota | White | 154365.0 | 4.0 | 13434.0 |
4 | Nissan | Blue | 181577.0 | 3.0 | 14043.0 |
# Count missing values per column
car_sales_missing.isnull().sum()
Make 49
Colour 50
Odometer (KM) 50
Doors 50
Price 50
dtype: int64
# Check each column's dtype
car_sales_missing.dtypes
Make object
Colour object
Odometer (KM) float64
Doors float64
Price float64
dtype: object
car_sales_missing["Doors"].value_counts()
4.0 811
5.0 75
3.0 64
Name: Doors, dtype: int64
虽然在数据集中 'Doors' 是 float64 类型,但从数据来看它是离散的(只有 3/4/5 三种取值),所以需要当作分类特征,待会用独热编码进行数字化。
方法 1: 用pandas填充缺失值
# Fill missing values with pandas: 'Make' and 'Colour' get the placeholder
# 'missing', "Odometer (KM)" gets the column mean, and "Doors" gets 4
# (the most common door count, per the value_counts above).
# FIX: column-level `fillna(..., inplace=True)` is chained assignment on a
# view — it raises FutureWarning in pandas 2.x and silently stops working
# under copy-on-write (pandas 3.0) — so assign the result back instead.
car_sales_missing['Make'] = car_sales_missing['Make'].fillna('missing')
car_sales_missing['Colour'] = car_sales_missing['Colour'].fillna('missing')
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(
    car_sales_missing["Odometer (KM)"].mean())
car_sales_missing["Doors"] = car_sales_missing["Doors"].fillna(4)
查看填充的效果
# Verify the fills: per the output below, only 'Price' should still have NaNs
car_sales_missing.isna().sum()
Make 0
Colour 0
Odometer (KM) 0
Doors 0
Price 50
dtype: int64
丢掉 'Price' 仍然缺失的行:'Price' 是要预测的标签,不应该用估计值来填充。
# Drop the remaining rows with NaNs — at this point only the 'Price'
# label can still be missing (see the isna() output above)
car_sales_missing.dropna(inplace=True)
这下数据集中就没有缺失值了
# Confirm no missing values remain in any column
car_sales_missing.isna().sum()
Make 0
Colour 0
Odometer (KM) 0
Doors 0
Price 0
dtype: int64
# Split data into features and labels, i.e. `X` and `y`
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

# Turn all feature columns into numbers.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# 'Doors' is float64 but discrete (3/4/5), so it is one-hot encoded along
# with the text columns; remainder='passthrough' keeps "Odometer (KM)".
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorical_features)],
                                remainder="passthrough")
# FIX: fit on the feature matrix X, not the full DataFrame — the original
# passed car_sales_missing, which leaked the target column 'Price' into
# transformed_X (visible as the last column of the printed array below).
transformed_X = transformer.fit_transform(X)
transformed_X
array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
3.54310e+04, 1.53230e+04],
[1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
1.92714e+05, 1.99430e+04],
[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
8.47140e+04, 2.83430e+04],
...,
[0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
6.66040e+04, 3.15700e+04],
[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
2.15883e+05, 4.00100e+03],
[0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
2.48360e+05, 1.27320e+04]])
方法 2: 调用sklearn中的方法去转化缺失值
上面是对整个数据集进行了处理,现在利用sklearn
的方法划分数据集和测试集,然后分别处理处理缺失值
# Reload the raw dataset (missing values intact) for method 2.
# NOTE(review): user-specific absolute Windows path — adjust per machine.
car_sales_missing = pd.read_csv(r"\Users\Administrator\Desktop\data\car-sales-extended-missing-data.csv")
car_sales_missing.head()
Make | Colour | Odometer (KM) | Doors | Price | |
---|---|---|---|---|---|
0 | Honda | White | 35431.0 | 4.0 | 15323.0 |
1 | BMW | Blue | 192714.0 | 5.0 | 19943.0 |
2 | Honda | White | 84714.0 | 4.0 | 28343.0 |
3 | Toyota | White | 154365.0 | 4.0 | 13434.0 |
4 | Nissan | Blue | 181577.0 | 3.0 | 14043.0 |
# Missing-value counts on the freshly loaded data
car_sales_missing.isna().sum()
Make 49
Colour 50
Odometer (KM) 50
Doors 50
Price 50
dtype: int64
# Drop only the rows whose 'Price' label is missing; the remaining NaNs
# in the feature columns will be imputed with sklearn below
car_sales_missing.dropna(subset = ['Price'], inplace = True)
car_sales_missing.isna().sum()
Make 47
Colour 46
Odometer (KM) 48
Doors 47
Price 0
dtype: int64
# Separate the features from the 'Price' label.
X = car_sales_missing.drop('Price', axis = 1)
y = car_sales_missing['Price']

from sklearn.model_selection import train_test_split

# Seed NumPy's RNG so the 80/20 split below is reproducible,
# then hold out 20% of the rows as the test set.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# The feature matrix still contains missing values — to be imputed next.
X.isna().sum()
Make 47
Colour 46
Odometer (KM) 48
Doors 47
dtype: int64
# Handle missing values with scikit-learn's imputers.
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# One imputer per column group: constant 'missing' for the text categories,
# constant 4 (the modal door count seen earlier) for 'Doors', and the
# column mean for the numeric odometer reading.
cat_imputer = SimpleImputer(strategy = 'constant', fill_value = 'missing')
door_imputer = SimpleImputer(strategy = 'constant', fill_value = 4)
num_imputer = SimpleImputer(strategy = 'mean')

# Columns each imputer applies to.
cat_features = ['Make', 'Colour']
door_features = ['Doors']
num_features = ['Odometer (KM)']

# Combine the imputers; ColumnTransformer takes (name, transformer, columns).
imputer = ColumnTransformer([('cat_imputer', cat_imputer, cat_features),
                             ('door_imputer', door_imputer, door_features),
                             ('num_imputer', num_imputer, num_features)])

# Fit on the training set only, then apply the SAME fitted statistics to the
# test set. FIX: the original called fit_transform on X_test too, which
# re-computed the mean from test data — data leakage.
filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.transform(X_test)

# Inspect the filled training set
filled_X_train
array([['Honda', 'White', 4.0, 71934.0],
['Toyota', 'Red', 4.0, 162665.0],
['Honda', 'White', 4.0, 42844.0],
...,
['Toyota', 'White', 4.0, 196225.0],
['Honda', 'Blue', 4.0, 133117.0],
['Honda', 'missing', 4.0, 150582.0]], dtype=object)
# Convert the imputed NumPy arrays back into DataFrames. The column order
# follows the ColumnTransformer output: text categories first, then
# 'Doors', then the odometer reading.
imputed_columns = ["Make", "Colour", "Doors", "Odometer (KM)"]
car_sales_filled_train = pd.DataFrame(filled_X_train, columns=imputed_columns)
car_sales_filled_test = pd.DataFrame(filled_X_test, columns=imputed_columns)

# Confirm the training set no longer has missing values.
car_sales_filled_train.isnull().sum()
Make 0
Colour 0
Doors 0
Odometer (KM) 0
dtype: int64
# Now convert everything to numeric via one-hot encoding.
categorical_features = ['Make', 'Colour', 'Doors']
# handle_unknown='ignore' so a category that only appears in the test set
# is encoded as all-zeros instead of raising an error at transform time.
one_hot = OneHotEncoder(handle_unknown='ignore')
# ColumnTransformer: (name, transformer, columns); remainder defaults to
# 'drop', so 'passthrough' keeps the un-encoded 'Odometer (KM)' column.
transformer = ColumnTransformer([('one_hot',
                                  one_hot,
                                  categorical_features)],
                                remainder = 'passthrough')
# Fit the encoder on the training data only and reuse it on the test data.
# FIX: the original re-fit on the test set, which leaks test information
# and can yield a different number of columns than the training matrix.
transformed_X_train = transformer.fit_transform(car_sales_filled_train)
transformed_X_test = transformer.transform(car_sales_filled_test)
transformed_X_train.toarray()
array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
0.00000e+00, 7.19340e+04],
[0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
0.00000e+00, 1.62665e+05],
[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
0.00000e+00, 4.28440e+04],
...,
[0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
0.00000e+00, 1.96225e+05],
[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
0.00000e+00, 1.33117e+05],
[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
0.00000e+00, 1.50582e+05]])
建模
# Model with a random forest regressor
np.random.seed(42)  # seed before instantiation for a reproducible forest
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(transformed_X_train, y_train)
# For a regressor, score() is the R^2 coefficient on the test set
model.score(transformed_X_test, y_test)
0.25366332156443805
总结
将前面的所有步骤总结在一起,包括缺失值处理,独热编码,建模处理,效果评估
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np

np.random.seed(42)

# NOTE(review): user-specific absolute Windows path — adjust per machine.
data = pd.read_csv(r"\Users\Administrator\Desktop\data\car-sales-extended-missing-data.csv")
# 'Price' is the label: drop rows where it is missing rather than imputing it.
data.dropna(subset = ['Price'], inplace = True)

# Pipelines chain the per-column steps, keeping the code compact and ensuring
# imputation/encoding statistics are learned from the training data only.
categorical_features = ['Make', 'Colour']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])

door_feature = ['Doors']
# FIX (consistency): 'Doors' is discrete (3/4/5) and is treated as a
# categorical feature in both earlier methods, but the original summary
# pipeline only imputed it and left it as a raw number — add the one-hot
# step so the summary matches the stated approach.
door_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 4)),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])

numeric_features = ['Odometer (KM)']
numeric_transformer = Pipeline(steps =[
    ('imputer', SimpleImputer(strategy = 'mean'))
])

# Route each column group to its pipeline.
preprocessor = ColumnTransformer(
    transformers = [
        ('cats', categorical_transformer, categorical_features),
        ('door', door_transformer, door_feature),
        ('num', numeric_transformer, numeric_features)
    ])

# Full model: preprocessing + random forest regressor in one Pipeline.
model = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

X = data.drop('Price', axis = 1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
model.fit(X_train, y_train)
# R^2 on the held-out test set
model.score(X_test, y_test)
0.22188417408787875
后续任务:
- 评估方法的理解和完善,比如分类和回归中score各自的含义,各种评估指标的理解
- 数据集的复杂度需要提升,数据集量太少
- 探索性数据的可视化
- 特征工程的摸索