Bike-Sharing Project

# -*- coding: utf-8 -*-
# @Time    : 2023/4/25 1:24 PM
# @Author  : gp
# @File    :共享单车.py
# @Software: PyCharm

1. Imports and data preprocessing

import datetime
import calendar
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Show all pandas columns/rows in the terminal
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
# Load the dataset
data = pd.read_csv('train.csv')

# Inspect the data
data.info()


# print(data.head())
# print(data.describe())
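For orientation, here is a minimal sanity check of the columns the rest of the script relies on; the list is inferred from the code below, not from an actual data.info() run, so dtypes and extra columns may differ depending on your train.csv.

# Columns the rest of this script uses (inferred from the code below)
expected_columns = [
    'datetime', 'season', 'holiday', 'workingday', 'weather',
    'temp', 'atemp', 'humidity', 'windspeed',
    'casual', 'registered', 'count',
]
assert set(expected_columns).issubset(data.columns)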


# Preprocessing: 'datetime' strings look like 2011-01-01 00:00:00
# Extract the day of the month
def get_day(s):
    return datetime.datetime.strptime(s, '%Y-%m-%d %H:%M:%S').day


def get_month(s):
    return datetime.datetime.strptime(s, '%Y-%m-%d %H:%M:%S').month


def get_hour(s):
    return datetime.datetime.strptime(s, '%Y-%m-%d %H:%M:%S').hour


# Create new columns: day, month, hour
data['day'] = data['datetime'].map(get_day)
data['month'] = data['datetime'].map(get_month)
data['hour'] = data['datetime'].map(get_hour)

# Quick demo of the calendar lookups (t is 1 here)
t = datetime.datetime.strptime('2011-01-01 00:00:00', '%Y-%m-%d %H:%M:%S').month
print(calendar.month_name[t])  # January  (month_name is indexed 1-12)
print(calendar.day_name[t])  # Tuesday  (day_name is indexed 0-6, Monday = 0)


# Return the month name for a datetime string
def creat_m_n(s):
    t = datetime.datetime.strptime(s, '%Y-%m-%d %H:%M:%S').month
    return calendar.month_name[t]


# Return the weekday name for a datetime string
def creat_d_n(s):
    t = datetime.datetime.strptime(s, '%Y-%m-%d %H:%M:%S').weekday()
    return calendar.day_name[t]


# Add the month-name and weekday-name columns to the original data
data['month_name'] = data['datetime'].map(creat_m_n)
data['day_name'] = data['datetime'].map(creat_d_n)
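
As a side note, the same columns can be built without row-wise strptime calls; a minimal sketch using pandas' vectorized datetime accessor (an alternative, not what the script above does):

# Vectorized alternative to the map/strptime approach above
dt = pd.to_datetime(data['datetime'])
# data['day'] = dt.dt.day
# data['month'] = dt.dt.month
# data['hour'] = dt.dt.hour
# data['month_name'] = dt.dt.month_name()
# data['day_name'] = dt.dt.day_name()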

2. Plotting

# Point plot of ride count by hour
sns.pointplot(data=data, x='hour', y='count')
plt.show()

Figure: ride count by hour of day

# Bucket hours into time-of-day segments
def c_hour_type(s):
    if 0 <= s <= 6:
        return 1
    elif 7 <= s <= 10:
        return 2
    elif 11 <= s <= 15:
        return 3
    elif 16 <= s <= 20:
        return 4
    else:
        return 5
# Add the time-segment column
data['hour_type'] = data['hour'].map(c_hour_type)
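
The same bucketing can also be written with pd.cut; a small equivalent sketch (the 'hour_type_cut' column name is just for comparison, it is not used later):

# Equivalent segmentation with pd.cut (bins are right-inclusive):
# (-1, 6] -> 1, (6, 10] -> 2, (10, 15] -> 3, (15, 20] -> 4, (20, 23] -> 5
data['hour_type_cut'] = pd.cut(data['hour'],
                               bins=[-1, 6, 10, 15, 20, 23],
                               labels=[1, 2, 3, 4, 5]).astype(int)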

sns.pointplot(data=data, x='hour_type', y='count')
plt.show()

sns.pointplot(data=data, x='month', y='count')
plt.show()

Figures: ride count by time segment; ride count by month

# Box plots ---> the data contains outliers
sns.boxplot(
    data=data,
    y='count'
)
plt.show()

sns.boxplot(
    data=data,
    x='season',
    y='count'
)
plt.show()

Figures: count box plots, overall and by season

# Extract the ride-count column
'''
Selecting a column:
loc:  -----> select by column/row labels, e.g. loc[:, 'count']
iloc: ----> select by column/row integer positions, e.g. iloc[:, 11]
'''
dd_count = data.loc[:, 'count']
# dd_count= data.iloc[:,11]
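
A quick check that the two selections are interchangeable here, assuming the column order of the original train.csv (where 'count' is the 12th column, index 11):

# Label-based and position-based selection return the same Series
assert data.columns.get_loc('count') == 11
assert data.loc[:, 'count'].equals(data.iloc[:, 11])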

# Mean of the ride count
dd_count_mean = np.mean(dd_count)

# Standard deviation of the ride count
dd_count_std = np.std(dd_count)

# Noise condition: a point whose count exceeds the mean by more than 3 standard deviations
noie_opt = (dd_count - dd_count_mean) > 3 * dd_count_std
print(noie_opt)
'''
0        False
1        False
2        False
3        False
4        False
         ...  
10881    False
10882    False
10883    False
10884    False
10885    False
'''
# print(type(~noie_opt)) # <class 'pandas.core.series.Series'>
noie_opt_v = noie_opt.values.flatten()
# print(noie_opt_v)


# Keep only the non-noise rows
# ~ inverts the boolean mask
dd_data_good = data.loc[~noie_opt_v, :]
dd_data_good.info()
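
Note that the mask above only flags the high tail (counts far above the mean); a symmetric 3-sigma filter would use the absolute deviation, sketched here for comparison:

# Symmetric variant of the 3-sigma rule (not used above, shown for comparison)
sym_noise = np.abs(dd_count - dd_count_mean) > 3 * dd_count_std
# count is non-negative and right-skewed, so the lower bound (mean - 3*std) is
# typically negative here and both masks end up selecting the same rows.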

# Correlation heatmap
heat_map_list = ['count', 'weather', 'temp', 'atemp', 'windspeed', 'casual', 'registered']

# Select the columns above
dd_heatmap = dd_data_good.loc[:, heat_map_list]
# Pearson correlation matrix
dd_heatmap_corr = dd_heatmap.corr()
sns.heatmap(
    data=dd_heatmap_corr,
    annot=True
)
plt.show()

Figure: correlation heatmap of count and weather/usage features
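
DataFrame.corr() defaults to the Pearson coefficient; one cell of the matrix can be cross-checked by hand, e.g. count vs. registered (any pair works):

# Sanity check: reproduce one cell of the correlation matrix with NumPy
r = np.corrcoef(dd_heatmap['count'], dd_heatmap['registered'])[0, 1]
print(r, dd_heatmap_corr.loc['count', 'registered'])  # the two values should match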

 
# Effect of season on ride volume
# Group by the 'season' column and take the mean of the numeric columns

# numeric_only=True keeps newer pandas versions from failing on the remaining string columns
season_gd = dd_data_good.groupby('season').mean(numeric_only=True)
print(season_gd)
'''
         holiday  workingday   weather  ...      month       hour  hour_type
season                                  ...                                 
1       0.026473    0.680089  1.425056  ...   2.004847  11.633482   2.736018
2       0.017817    0.688196  1.427246  ...   4.998886  11.426503   2.692279
3       0.036036    0.667417  1.368619  ...   7.993994  11.383634   2.682432
4       0.035569    0.671360  1.461282  ...  11.011856  11.473138   2.702112
'''
# Reset the index so that season becomes a regular column
season_gd_good = season_gd.reset_index()
print(season_gd_good)
'''
   season   holiday  workingday  ...      month       hour  hour_type
0       1  0.026473    0.680089  ...   2.004847  11.633482   2.736018
1       2  0.017817    0.688196  ...   4.998886  11.426503   2.692279
2       3  0.036036    0.667417  ...   7.993994  11.383634   2.682432
3       4  0.035569    0.671360  ...  11.011856  11.473138   2.702112
'''
# Ride volume by season
sns.barplot(
    data=season_gd_good,
    x='season',
    y='count'
)
plt.show()

Figure: mean ride count by season


# Group by 'season' and 'hour', take the mean, and plot
season_gd = dd_data_good.groupby(by=['season', 'hour']).mean(numeric_only=True).reset_index()
sns.barplot(
    data=season_gd,
    x='hour',
    y='count',
    hue='season'
)
plt.show()

Figure: mean ride count by hour, split by season


3. Feature engineering

'''---------------'''
# Feature engineering ------> prepare the feature matrix x
# Check the data
# dd_data_good.info()
# Drop redundant columns: datetime, month, hour
dd_data_good.drop('datetime', inplace=True, axis=1)
dd_data_good.drop('month', inplace=True, axis=1)
dd_data_good.drop('hour', inplace=True, axis=1)
# dd_data_good.info()

# One-hot encoding ----> increases dimensionality ----> watch out for overfitting
'''
Discrete features: human-defined categories, countable natural numbers (object/int64) ------> one-hot encode
Continuous features: any value within an interval, e.g. [35.0, 37.2]                  ------> feature scaling / standardization / normalization
'''
dd_data_good = pd.get_dummies(data=dd_data_good, columns=['season', 'month_name', 'day_name', 'weather', 'hour_type'])
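
To see what get_dummies actually produces, a tiny standalone toy example (not project data; dtype=int just forces 0/1 output on newer pandas):

# Toy illustration: each category value becomes its own 0/1 indicator column
toy = pd.DataFrame({'season': [1, 2, 2, 4]})
print(pd.get_dummies(toy, columns=['season'], dtype=int))
#    season_1  season_2  season_4
# 0         1         0         0
# 1         0         1         0
# 2         0         1         0
# 3         0         0         1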

# Feature scaling
from sklearn.preprocessing import StandardScaler

st_list = ['temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered']
for i in st_list:
    dd_data_good[i] = StandardScaler().fit_transform(dd_data_good[[i]])
    '''
    dd_data_good[i]  ------> 1-D Series
    StandardScaler expects 2-D input, hence dd_data_good[[i]]
    '''
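
The per-column loop works; since StandardScaler standardizes each feature independently anyway, the same result can be obtained in one call, sketched below (left commented out because the loop above has already scaled these columns):

# One-call equivalent of the loop above:
# dd_data_good[st_list] = StandardScaler().fit_transform(dd_data_good[st_list])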

4. Machine learning


from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV

# Split features and label: 'count' is the label (the value we predict)
y = dd_data_good.pop('count')  # pop removes the column from the DataFrame and returns it
x = dd_data_good
# print(type(x)) # <class 'pandas.core.frame.DataFrame'>
# Convert the DataFrame/Series to NumPy arrays for scikit-learn
x_arr = np.array(x)
y_arr = np.array(y)

# Split into training and test sets (default ratio 75:25)
train_x, test_x, train_y, test_y = train_test_split(x_arr, y_arr, test_size=0.25)

# Create the model object
lr = Ridge()

# Grid search with cross-validation ----> find the best hyperparameter
model = GridSearchCV(lr, param_grid={'alpha': [0.1, 0.2, 0.3, 0.4, 0.002]}, cv=5)
model.fit(train_x, train_y)
print(model.best_params_)
print(model.best_score_)

# Retrain the model with the best alpha
lr = Ridge(alpha=model.best_params_['alpha'])
lr.fit(train_x, train_y)
# Predict on the test set
y_pre = lr.predict(test_x)

# R^2 scores on the training and test sets
print(lr.score(train_x, train_y))
print(lr.score(test_x, test_y))
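
A small aside: GridSearchCV refits the best estimator on the full training set by default (refit=True), so the manually retrained Ridge above could also be replaced by the search object itself, a sketch:

# model.best_estimator_ is already a Ridge refit with the best alpha on train_x/train_y
print(model.best_estimator_.score(test_x, test_y))
# y_pre = model.predict(test_x)  # predict() delegates to best_estimator_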

# Model evaluation metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print('r2', r2_score(test_y, y_pre))
print('mse', mean_squared_error(test_y, y_pre))
print('mae', mean_absolute_error(test_y, y_pre))
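
MSE is in squared count units; it is often reported as RMSE as well, which is back in the same units as the label:

# RMSE = sqrt(MSE)
print('rmse', np.sqrt(mean_squared_error(test_y, y_pre)))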
