"""Offline check of a day-of-week mean baseline for the daily ``cnt`` total.

The script loads the train, test and sample files from ``./data/``, sums
``cnt`` per (``date``, ``day_of_week``), holds out the rows whose ``date`` is
greater than 756, predicts every held-out row with the mean ``cnt`` that its
``day_of_week`` has in the remaining rows, and prints the mean squared error
between the actual and the predicted values.
"""

import matplotlib.pyplot as plt  # only used by the optional plots kept below
import pandas as pd
from sklearn.metrics import mean_squared_error

# Named ``data_dir`` rather than ``dir`` so the builtin ``dir()`` stays usable.
data_dir = './data/'

# Rows with ``date`` <= SPLIT_DATE are used for fitting, later rows for
# validation.  NOTE(review): 756 is taken as-is from the original script;
# confirm it is still the intended cut-off for this data set.
SPLIT_DATE = 756

train = pd.read_table(data_dir + 'train_20171215.txt', engine='python')
test_A = pd.read_table(data_dir + 'test_A_20171225.txt', engine='python')
sample_A = pd.read_table(
    data_dir + 'sample_A_20171225.txt', engine='python', header=None
)
# NOTE(review): the sample file is read without a header and its two columns
# are labelled here; confirm the second column really is ``day_of_week``.
sample_A.columns = ['date', 'day_of_week']

# The first round only asks for the time-dependent ``cnt`` total, so the rows
# can be collapsed to one summed ``cnt`` per (date, day_of_week).
train = train.groupby(['date', 'day_of_week'], as_index=False).cnt.sum()

# Optional exploration, left disabled as in the original script:
# print(train)
# plt.plot(train['day_of_week'], train['cnt'], '*')
# plt.show()

# Optional: one subplot per day of week showing ``cnt`` along the date axis.
# for i in range(7):
#     tmp = train[train['day_of_week'] == i + 1]
#     plt.subplot(7, 1, i + 1)
#     plt.plot(tmp['date'], tmp['cnt'], '*')
# plt.show()

# Split the aggregated data into the offline train and test parts.
xx_train = train[train['date'] <= SPLIT_DATE]
xx_test = train[train['date'] > SPLIT_DATE]
print('test shape', xx_test.shape)
print('train shape', xx_train.shape)

# Plan zero: plain averaging, validated on the raw data.
# Compute the unweighted mean ``cnt`` for each day of week on the train part.
# ``xx_train`` is deliberately rebound to that per-weekday table, exactly as
# the original script did.
xx_train = xx_train.groupby(['day_of_week'], as_index=False).cnt.mean()

# Left merge keeps every held-out row; both frames carry a ``cnt`` column, so
# pandas suffixes them: ``cnt_x`` is the actual value from ``xx_test`` and
# ``cnt_y`` is the weekday mean used as the prediction.
xx_result = pd.merge(xx_test, xx_train, on=['day_of_week'], how='left')
print('xx_result shape', xx_result.shape)
print(xx_result)
print(mean_squared_error(xx_result['cnt_x'], xx_result['cnt_y']))
Python 数据分析案例 1
最新推荐文章于 2023-03-30 09:11:14 发布