文章目录
一、 导入数据
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plotting theme to all subsequent figures.
sns.set()
# IPython magic (only valid inside IPython/Jupyter): render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Load the training set; the path is relative to the current working directory.
bike_df0 = pd.read_csv('data/bike/train.csv')
# Print each column's non-null count and dtype.
bike_df0.info()
"""
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 datetime 10886 non-null object
1 season 10886 non-null int64
2 holiday 10886 non-null int64
3 workingday 10886 non-null int64
4 weather 10886 non-null int64
5 temp 10886 non-null float64
6 atemp 10886 non-null float64
7 humidity 10886 non-null int64
8 windspeed 10886 non-null float64
9 casual 10886 non-null int64
10 registered 10886 non-null int64
11 count 10886 non-null int64
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB
"""
# Summary statistics, requesting the 1st and 99th percentiles to expose extreme values.
bike_df0.describe([0.01, 0.99])
二、 特征工程
2.1 类型转换
# Work on a copy so the in-place edits that follow leave bike_df0 untouched.
bike_df1 = bike_df0.copy()
def transform_datetime(df):
    """Replace the raw ``datetime`` column with ``year`` and ``hour`` features.

    The frame is modified in place and nothing is returned.

    Args:
        df: DataFrame with a ``datetime`` column whose values
            ``pd.to_datetime`` can parse.

    Raises:
        KeyError: if ``df`` has no ``datetime`` column.
    """
    # Parse once into a local Series. Bracket access is used instead of
    # ``df.datetime = ...`` because attribute-style assignment is discouraged
    # by pandas and silently creates a plain attribute if the column is absent.
    timestamps = pd.to_datetime(df['datetime'])
    # Derive the year and hour-of-day features.
    df['year'] = timestamps.dt.year
    df['hour'] = timestamps.dt.hour
    # The raw timestamp is discarded once the features have been extracted.
    df.drop(columns=['datetime'], inplace=True)
# Mutates bike_df1 in place: adds year/hour columns and removes datetime.
transform_datetime(bike_df1)
2.2 特征筛选
# Inspect the pairwise correlation coefficients as an annotated heatmap.
plt.figure(figsize=(12, 8))
sns.heatmap(bike_df1.corr(), vmin=-1, cmap=plt.cm.coolwarm, annot=True)
# Drop the atemp, holiday, windspeed and count columns.
# NOTE(review): count is removed here, so later code cannot plot against it;
# presumably casual/registered serve as the targets from here on — confirm.
bike_df1.drop(columns=['atemp', 'holiday', 'windspeed', 'count'], inplace=True)
2.3 异常值处理
# 画图查看hour列、temp列、humidity列与count的关系
def show_img(df, col_name, value_name):
temp = pd.pivot_table(df, index=[col_name], values=[value_name], aggfunc='mean')
plt.figure(figsize=(6,4))
sns.lineplot(
x=col_name,
y=value_name,
data=temp
)
plt