#' Draw a boxplot of log10-scale data with a y-axis labelled in original units
#'
#' Forwards every argument in `...` to `boxplot()` with the default y-axis
#' suppressed, then draws a left-hand axis whose tick marks are placed at
#' `log10(yticks)` but labelled with the untransformed `yticks` values.
#' The function does not transform the data itself: callers pass values
#' (and `ylim`, if any) that are already on the log10 scale.
#'
#' @param ... Arguments passed on to `boxplot()`. `yaxt` is set by this
#'   function and should not be supplied.
#' @param yticks Positive numbers, on the original (untransformed) scale, at
#'   which to draw labelled tick marks. Must be named in full when supplied.
#' @return The value of the final `axis()` call.
logboxplot <- function(...,
                       yticks = c(1, 2, 5, 10, 20, 50, 100, 200, 500, 1000,
                                  2000)) {
  # log10() of a non-positive tick would give -Inf/NaN positions, so reject
  # bad tick vectors before anything is drawn.
  stopifnot(is.numeric(yticks), length(yticks) > 0, all(yticks > 0))

  boxplot(..., yaxt = "n")

  # `labels` is spelled out in full; the previous `label =` only worked
  # through partial argument matching.
  axis(side = 2, at = log10(yticks), labels = yticks)
}
接上回流程概述的第一部分,我们首先定义上述代码块中的函数 logboxplot(),用于绘制对数箱式图。
# Two panels side by side: hourly hire counts for casual users (left) and
# registered users (right). Only rows with a positive count are kept, since
# the counts are log10-transformed before plotting.
par(mfrow = c(1, 2))

logboxplot(
  log10(casual) ~ hour,
  data = subset(bikedata2011, casual > 0),
  ylim = log10(c(1, 500)),
  main = "casual",
  ylab = "number of hires",
  col = "blue"
)

logboxplot(
  log10(registered) ~ hour,
  data = subset(bikedata2011, registered > 0),
  ylim = log10(c(1, 500)),
  main = "registered",
  ylab = "number of hires",
  col = "blue"
)
随后我们通过以上代码块生成时间(小时)为横轴,租赁数目为纵轴的casual和registered两种租赁形式的对数箱式图。

casual用户在一天中的变化看起来类似一个平滑的正弦函数,其四分位数之间的范围也更加宽阔,casual用户的租用次数比registered用户的租用次数变化要大得多。就registered用户而言,白天有两个高峰,与人们上下班的高峰时段相对应。这表明registered用户的使用模式在工作日和周末可能有所不同。因此,我们对周末和工作日的使用频次做了一个区分。
# Flag each record as weekend (Saturday/Sunday) or not; the new `weekend`
# column is used for the two subsets below.
bikedata2011 <- transform(
  bikedata2011,
  weekend = weekday %in% c("Saturday", "Sunday")
)

# Two panels side by side: registered hires per hour at the weekend (left)
# and during the working week (right), both on the same log10 y-range.
par(mfrow = c(1, 2))

logboxplot(
  log10(registered) ~ hour,
  data = subset(bikedata2011, registered > 0 & weekend),
  ylim = log10(c(1, 500)),
  main = "weekend",
  ylab = "registered hires",
  col = "blue"
)

logboxplot(
  log10(registered) ~ hour,
  data = subset(bikedata2011, registered > 0 & !weekend),
  ylim = log10(c(1, 500)),
  main = "working week",
  ylab = "registered hires",
  col = "blue"
)

从这里我们可以看出,周末看不到高峰时段的峰值。这对我们的回归模型有重要影响。如果对registered用户而言,时间的影响取决于一个日期在一周内的位置(星期几),那么就不能用一个参数来概括每个时间的影响,我们需要在模型中加入一个交互项。
进一步的 EDA 需要将数据汇总,这样我们就可以查看每天而不是每小时的自行车租用数。租用数(casual、registered、cnt)按日求和;温度、体感温度、湿度和风速等天气变量则取每日 24 小时中的最大值。
# Daily rental totals: sum the hourly counts within each date (the calendar
# columns are carried along as grouping keys).
rentals.daily <- aggregate(
  bikedata2011[, c("casual", "registered", "cnt")],
  by = bikedata2011[, c("date", "season", "month", "weekday", "weekend")],
  FUN = sum
)

# Daily weather summary: the maximum of each weather variable per date.
weather.daily <- aggregate(
  bikedata2011[, c("temp", "atemp", "humidity", "windspeed")],
  by = bikedata2011[, "date", drop = FALSE],
  FUN = max
)

# Combine the two daily tables; merge() joins on the columns they share.
rentals.daily <- merge(rentals.daily, weather.daily)
新生成的数据框 rentals.daily 现在包含 365 行,即2011 年每天的一条记录。
>rentals.daily
date season month weekday weekend casual registered cnt temp atemp humidity windspeed
1 2011-01-01 winter January Saturday TRUE 331 654 985 0.46 0.4545 0.94 0.2985
2 2011-01-02 winter January Sunday TRUE 131 670 801 0.46 0.4545 1.00 0.4478
3 2011-01-03 winter January Monday FALSE 120 1229 1349 0.26 0.2576 0.69 0.4179
4 2011-01-04 winter January Tuesday FALSE 108 1454 1562 0.30 0.2879 0.74 0.3284
5 2011-01-05 winter January Wednesday FALSE 82 1518 1600 0.30 0.3182 0.74 0.3284
6 2011-01-06 winter January Thursday FALSE 88 1518 1606 0.28 0.2879 0.69 0.2836
7 2011-01-07 winter January Friday FALSE 148 1362 1510 0.22 0.2727 0.69 0.3284
8 2011-01-08 winter January Saturday TRUE 68 891 959 0.20 0.2424 0.93 0.5522
9 2011-01-09 winter January Sunday TRUE 54 768 822 0.22 0.1970 0.53 0.5224
10 2011-01-10 winter January Monday FALSE 41 1280 1321 0.20 0.2273 0.59 0.2985
11 2011-01-11 winter January Tuesday FALSE 43 1220 1263 0.20 0.2273 0.93 0.2239
12 2011-01-12 winter January Wednesday FALSE 25 1137 1162 0.22 0.1970 0.93 0.5821
13 2011-01-13 winter January Thursday FALSE 38 1368 1406 0.24 0.2121 0.59 0.4478
14 2011-01-14 winter January Friday FALSE 54 1367 1421 0.24 0.2576 0.74 0.3881
15 2011-01-15 winter January Saturday TRUE 222 1026 1248 0.34 0.3333 0.64 0.2985
16 2011-01-16 winter January Sunday TRUE 251 953 1204 0.28 0.3030 0.69 0.2985
17 2011-01-17 winter January Monday FALSE 117 883 1000 0.20 0.2121 0.93 0.2836
18 2011-01-18 winter January Tuesday FALSE 9 674 683 0.22 0.2727 0.93 0.3284
19 2011-01-19 winter January Wednesday FALSE 78 1572 1650 0.40 0.4091 0.93 0.4627
20 2011-01-20 winter January Thursday FALSE 83 1844 1927 0.32 0.3333 0.65 0.3881
21 2011-01-21 winter January Friday FALSE 75 1468 1543 0.24 0.2576 0.87 0.5821
22 2011-01-22 winter January Saturday TRUE 93 888 981 0.12 0.1970 0.57 0.3881
23 2011-01-23 winter January Sunday TRUE 150 836 986 0.16 0.1364 0.62 0.4627
24 2011-01-24 winter January Monday FALSE 86 1330 1416 0.16 0.1667 0.64 0.2537
25 2011-01-25 winter January Tuesday FALSE 186 1799 1985 0.32 0.3485 0.74 0.2836
26 2011-01-26 winter January Wednesday FALSE 34 472 506 0.24 0.2424 0.93 0.4627
27 2011-01-27 winter January Thursday FALSE 15 416 431 0.22 0.2424 0.93 0.3582
28 2011-01-28 winter January Friday FALSE 38 1129 1167 0.24 0.2727 0.93 0.3582
29 2011-01-29 winter January Saturday TRUE 123 975 1098 0.24 0.2879 0.80 0.3582
30 2011-01-30 winter January Sunday TRUE 140 956 1096 0.30 0.3333 0.93 0.2836
31 2011-01-31 winter January Monday FALSE 42 1459 1501 0.30 0.3182 0.69 0.3284
32 2011-02-01 winter February Tuesday FALSE 47 1313 1360 0.24 0.2879 0.93 0.1940
33 2011-02-02 winter February Wednesday FALSE 72 1454 1526 0.38 0.3939 1.00 0.5522
34 2011-02-03 winter February Thursday FALSE 61 1489 1550 0.22 0.2576 0.55 0.5224
35 2011-02-04 winter February Friday FALSE 88 1620 1708 0.30 0.2879 0.80 0.2537
36 2011-02-05 winter February Saturday TRUE 100 905 1005 0.30 0.2879 1.00 0.4478
37 2011-02-06 winter February Sunday TRUE 354 1269 1623 0.34 0.3636 0.70 0.4179
38 2011-02-07 winter February Monday FALSE 120 1592 1712 0.38 0.3939 1.00 0.1642
39 2011-02-08 winter February Tuesday FALSE 64 1466 1530 0.28 0.3182 0.93 0.5821
40 2011-02-09 winter February Wednesday FALSE 53 1552 1605 0.20 0.1970 0.86 0.3582
41 2011-02-10 winter February Thursday FALSE 47 1491 1538 0.20 0.2121 0.86 0.4179
42 2011-02-11 winter February Friday FALSE 149 1597 1746 0.32 0.3333 0.74 0.2537
43 2011-02-12 winter February Saturday TRUE 288 1184 1472 0.34 0.3182 0.93 0.5224
44 2011-02-13 winter February Sunday TRUE 397 1192 1589 0.42 0.4242 0.75 0.4627
45 2011-02-14 winter February Monday FALSE 208 1705 1913 0.60 0.5909 0.53 0.6567
46 2011-02-15 winter February Tuesday FALSE 140 1675 1815 0.34 0.3333 0.52 0.7761
47 2011-02-16 winter February Wednesday FALSE 218 1897 2115 0.46 0.4545 0.55 0.4179
48 2011-02-17 winter February Thursday FALSE 259 2216 2475 0.60 0.6212 0.70 0.3881
49 2011-02-18 winter February Friday FALSE 579 2348 2927 0.66 0.6212 0.77 0.5821
50 2011-02-19 winter February Saturday TRUE 532 1103 1635 0.48 0.4697 0.33 0.7463
51 2011-02-20 winter February Sunday TRUE 639 1173 1812 0.36 0.3636 0.61 0.5522
52 2011-02-21 winter February Monday FALSE 195 912 1107 0.42 0.4242 0.87 0.4478
53 2011-02-22 winter February Tuesday FALSE 74 1376 1450 0.24 0.2273 0.80 0.4179
54 2011-02-23 winter February Wednesday FALSE 139 1778 1917 0.34 0.3636 0.60 0.1940
55 2011-02-24 winter February Thursday FALSE 100 1707 1807 0.40 0.4091 0.93 0.3881
56 2011-02-25 winter February Friday FALSE 120 1341 1461 0.56 0.5303 1.00 0.8060
57 2011-02-26 winter February Saturday TRUE 424 1545 1969 0.36 0.3485 0.75 0.3582
58 2011-02-27 winter February Sunday TRUE 694 1708 2402 0.48 0.4697 0.87 0.2836
59 2011-02-28 winter February Monday FALSE 81 1365 1446 0.56 0.5303 1.00 0.6119
60 2011-03-01 winter March Tuesday FALSE 137 1714 1851 0.34 0.3636 0.70 0.4627
61 2011-03-02 winter March Wednesday FALSE 231 1903 2134 0.54 0.5152 0.75 0.5522
62 2011-03-03 winter March Thursday FALSE 123 1562 1685 0.26 0.3030 0.43 0.4925
63 2011-03-04 winter March Friday FALSE 214 1730 1944 0.36 0.3485 0.80 0.2985
64 2011-03-05 winter March Saturday TRUE 640 1437 2077 0.48 0.4697 1.00 0.3582
65 2011-03-06 winter March Sunday TRUE 114 491 605 0.46 0.4545 1.00 0.6119
66 2011-03-07 winter March Monday FALSE 244 1628 1872 0.34 0.3182 1.00 0.5821
67 2011-03-08 winter March Tuesday FALSE 316 1817 2133 0.38 0.3939 0.69 0.2239
68 2011-03-09 winter March Wednesday FALSE 191 1700 1891 0.36 0.3333 0.93 0.3881
69 2011-03-10 winter March Thursday FALSE 46 577 623 0.44 0.4394 0.00 0.5821
70 2011-03-11 winter March Friday FALSE 247 1730 1977 0.36 0.3485 1.00 0.3582
71 2011-03-12 winter March Saturday TRUE 724 1408 2132 0.46 0.4545 0.75 0.4925
72 2011-03-13 winter March Sunday TRUE 982 1435 2417 0.48 0.4697 0.76 0.4179
73 2011-03-14 winter March Monday FALSE 359 1687 2046 0.40 0.4091 0.70 0.3284
74 2011-03-15 winter March Tuesday FALSE 289 1767 2056 0.38 0.3939 0.87 0.3284
75 2011-03-16 winter March Wednesday FALSE 321 1871 2192 0.44 0.4394 1.00 0.3881
76 2011-03-17 winter March Thursday FALSE 424 2320 2744 0.52 0.5000 0.76 0.3284
77 2011-03-18 winter March Friday FALSE 884 2355 3239 0.70 0.6364 0.71 0.3582
78 2011-03-19 winter March Saturday TRUE 1424 1693 3117 0.60 0.6212 0.53 0.4925
79 2011-03-20 winter March Sunday TRUE 1047 1424 2471 0.42 0.4242 0.61 0.4179
80 2011-03-21 spring March Monday FALSE 401 1676 2077 0.58 0.5455 0.94 0.4179
81 2011-03-22 spring March Tuesday FALSE 460 2243 2703 0.50 0.4848 1.00 0.4627
82 2011-03-23 spring March Wednesday FALSE 203 1918 2121 0.40 0.4091 1.00 0.6418
83 2011-03-24 spring March Thursday FALSE 166 1699 1865 0.32 0.3030 1.00 0.3881
[ reached 'max' / getOption("max.print") -- omitted 282 rows ]
我们从 GGally 软件包中的 EDA 常用函数 ggpairs() 开始。
# Pairwise plot matrix of the four daily weather variables.
ggpairs(
  rentals.daily,
  columns = c("temp", "atemp", "humidity", "windspeed")
)

这表明天气变量之间没有很强的相关性,只有temp和atemp 高度相关。这并不奇怪,因为它们是测量同一事物的两种方法:一种是客观的,一种是主观的。你不应该在模型中同时包含这两个变量。我选择保留 atemp(感觉温度),因为我认为租用自行车的决定更有可能受到对温度的主观感觉而非客观值的影响。需要注意的是,感觉温度通常包含在天气预报中,而不仅仅是个人的判断。
(未完待续)
文章讲述了如何在 R 语言中定义函数 logboxplot 来绘制对数箱式图,分析 casual 和 registered 两种自行车租赁量随时间(小时)的变化,并发现 registered 用户在工作日和周末的使用模式不同。通过进一步的探索,作者发现各天气变量之间除 temp 与 atemp 高度相关外并无很强的相关性,因此只选择 atemp 作为模型中的温度变量。
3537

被折叠的 条评论
为什么被折叠?



