目标
分析San Francisco的犯罪案件的模式。数据来自:
https://data.sfgov.org/Public-Safety/SFPD-Incidents-Previous-Three-Months/tmnf-yvry?
希望回答以下问题:
- 在哪里停车最危险?
- SF最安全的地方是哪里? 每周的哪天/哪个时间最危险?
- 某种特定偷窃案件是否在某个区域更普遍?
准备分析用的包
我们用dplyr来整理数据,ggplot2以及ggmap来进行数据可视化
1
|
## Loading
required
package: ggplot2
|
1
|
## Loading
required
package: ggmap
|
1
|
## Loading
required
package: ggthemes
|
准备数据
先设置工作目录并读入数据,然后把日期转换为Date格式,同时把时间按小时划分(只取小时,不考虑分钟)。
1
2
3
4
5
6
7
8
9
10
11
|
# Load the SFPD incident export and derive the columns used by the analysis.
# NOTE(review): the working directory is a machine-specific absolute path;
# adjust it before running this script elsewhere.
setwd("d:/project/datascience/teamleada")

# stringsAsFactors is spelled out because R >= 4.0.0 no longer converts
# strings to factors by default. The rest of the script works on factor
# columns (the order.level() helper reads levels(), and the printed tibble
# shows Time/PdDistrict/... as fctr), so the old default is made explicit.
crime <- read.csv(
  "./SFPD_Incidents_-_Previous_Three_Months.csv",
  stringsAsFactors = TRUE
)

# Drop the columns that the analysis never uses.
crime$Location <- NULL
crime$IncidntNum <- NULL

# Dates arrive as month/day/year text.
crime$Date <- as.Date(crime$Date, format = "%m/%d/%Y")

# Keep only the first two characters of Time (the hour); minutes are ignored.
crime$Hour <- as.factor(substr(as.character(crime$Time), 1, 2))

# NOTE(review): tbl_df() is deprecated in recent dplyr releases; it is kept
# here so the script still runs on the dplyr version it was written for.
crime.df <- tbl_df(crime)
crime.df
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
## Source: local data frame [30,760 x 11]
##
## Category Descript DayOfWeek
## 1 LARCENY/THEFT GRAND THEFT FROM UNLOCKED AUTO Sunday
## 2 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Sunday
## 3 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Sunday
## 4 DRUG/NARCOTIC POSSESSION OF METH-AMPHETAMINE Sunday
## 5 DRUG/NARCOTIC POSSESSION OF COCAINE Sunday
## 6 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Sunday
## 7 WARRANTS WARRANT ARREST Sunday
## 8 VEHICLE THEFT STOLEN AUTOMOBILE Sunday
## 9 LARCENY/THEFT PETTY THEFT OF PROPERTY Sunday
## 10 ROBBERY ROBBERY OF A CHAIN STORE WITH BODILY FORCE Sunday
## .. ... ... ...
## Variables not shown: Date (date), Time (fctr), PdDistrict (fctr),
## Resolution (fctr), Address (fctr), X (dbl), Y (dbl), Hour (fctr)
|
辅助函数
这里主要有两个辅助函数。画直方图时,我们希望按count的大小来排序,而不是按数据中factor变量的level来排序,所以写了一个辅助函数来对factor的level重新排序
1
2
3
4
|
# Reorder the levels of a grouping variable by descending count.
#
# Args:
#   level.var: factor or character vector of group labels, one per count.
#   count:     numeric vector of the same length; count[i] belongs to
#              level.var[i].
#
# Returns:
#   A factor holding the same labels as level.var whose levels run from the
#   largest count to the smallest, so a bar chart draws them in that order.
order.level <- function(level.var, count) {
  stopifnot(length(level.var) == length(count))
  labels <- as.character(level.var)
  # Rank the labels themselves instead of indexing levels(level.var) by
  # position. Positional indexing only works when the rows happen to be in
  # level order, and it yields an all-NA factor for character input, because
  # levels() of a character vector is NULL.
  ranked <- unique(labels[order(count, decreasing = TRUE)])
  factor(labels, levels = ranked)
}
|
另一个函数用来控制多个图形绘制时的排版,这里直接使用了R cookbook提供的函数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
# Draw several plot objects on a single page using a grid layout.
#
# Args:
#   ...:      plot objects to draw.
#   plotlist: optional list of further plot objects, appended after `...`.
#   file:     not used by the body; kept so existing calls stay valid.
#   cols:     number of columns of the default layout.
#   layout:   optional matrix of plot indices; the cells equal to i mark
#             where the i-th plot is drawn. When given, `cols` is not used.
#
# Returns:
#   invisible(NULL) when there is nothing to draw; otherwise the value of the
#   final drawing step. Called for its side effect of printing the plots.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  # library() instead of require(): a missing package stops here with a clear
  # error rather than returning FALSE and failing later in viewport().
  library(grid)

  plots <- c(list(...), plotlist)
  numPlots <- length(plots)

  # Nothing to draw. Without this guard the 1:numPlots loop would run for
  # i = 1 and i = 0 and fail on plots[[1]].
  if (numPlots == 0) {
    return(invisible(NULL))
  }

  # Default layout: fill a cols-wide matrix with plot indices, column by
  # column, with enough rows to hold every plot.
  if (is.null(layout)) {
    rows <- ceiling(numPlots / cols)
    layout <- matrix(seq(1, cols * rows), ncol = cols, nrow = rows)
  }

  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    for (i in seq_len(numPlots)) {
      # Row/column positions of every layout cell assigned to plot i.
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(
        plots[[i]],
        vp = viewport(
          layout.pos.row = matchidx$row,
          layout.pos.col = matchidx$col
        )
      )
    }
  }
}
|
在哪里停车?
这里我们先只取VEHICLE THEFT进行分析,之后按照案件发生的区域分组统计
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
# Vehicle thefts: count per police district and prepare the base map.

# The ggmap base layer looks up columns named lon/lat, so they are added
# BEFORE the vehicle-theft subset is taken. Adding them afterwards leaves
# crime.df.vehicle without these columns, and the density layers drawn on
# SFMap then fail with "object 'lon' not found".
crime.df$lon <- crime.df$X
crime.df$lat <- crime.df$Y

crime.df.vehicle <- filter(crime.df, Category == "VEHICLE THEFT")

crime.vehicle.district <- crime.df.vehicle %>%
  group_by(PdDistrict) %>%
  summarise(count = n())

# Sort districts from most to fewest thefts for plotting.
crime.vehicle.district$PdDistrict <- order.level(
  crime.vehicle.district$PdDistrict,
  crime.vehicle.district$count
)

# geom = "bar" rather than "histogram": the x variable is a factor, and
# ggplot2 >= 2.0 refuses to bin a discrete variable. A weighted bar chart
# gives the same picture on old and new ggplot2 alike.
p1 <- qplot(
  PdDistrict,
  data = crime.vehicle.district,
  weight = count,
  geom = "bar",
  xlab = "District",
  ylab = "Vehicle Theft Count"
)

SFMap <- qmap("San Francisco", zoom = 13, color = "bw", legend = "topleft")
|
1
2
3
4
|
#
# Map from URL : http:
#
# Google Maps API Terms of Service : http:
#
# Information from URL : http:
#
# Google Maps API Terms of Service : http:
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
# Overlay the vehicle-theft density on the San Francisco base map: contour
# lines first, then filled density polygons whose colour and transparency
# both follow the density level.
p2 <- SFMap +
  geom_density2d(
    data = crime.df.vehicle,
    aes(group = 1)
  ) +
  stat_density2d(
    data = crime.df.vehicle,
    aes(group = 1, fill = ..level.., alpha = ..level..),
    size = 0.01,
    bins = 16,
    geom = "polygon"
  ) +
  scale_fill_gradient(low = "green", high = "red") +
  scale_alpha(range = c(0.00, 0.25), guide = FALSE) +
  theme(
    legend.position = "none",
    axis.title = element_blank(),
    text = element_text(size = 12)
  ) +
  ggtitle("Vehicle Theft")

# District bar chart on top, density map below, in a single column.
multiplot(p1, p2, cols = 1)
|
1
|
## Loading
required
package: grid
|
1
|
## Error:
object
'lon'
not found
|
VEHICLE THEFT
显然Ingleside、Mission、Bayview区的案件更多,密度图也反映了相同的信息
SF最安全的地方是哪里? 每周的哪天/哪个时间最危险?
先按照每个区计算案件数目
1
2
3
4
5
6
|
# Number of incidents per police district, sorted from most to fewest.
crime.by.district <- crime.df %>%
  group_by(PdDistrict) %>%
  summarise(count = n())
crime.by.district$PdDistrict <- order.level(
  crime.by.district$PdDistrict,
  crime.by.district$count
)

# geom = "bar" rather than "histogram": PdDistrict is discrete, and
# ggplot2 >= 2.0 refuses to bin a discrete variable. The weighted bar chart
# is the same picture on both old and new ggplot2.
p1 <- qplot(
  PdDistrict,
  data = crime.by.district,
  weight = count,
  geom = "bar",
  xlab = "District",
  ylab = "# of Crimes",
  main = "# of Crimes by District"
)
|
根据每天来统计
1
2
3
4
5
6
|
# Number of incidents per day of the week, sorted from most to fewest.
crime.by.day <- crime.df %>%
  group_by(DayOfWeek) %>%
  summarise(count = n())
crime.by.day$DayOfWeek <- order.level(
  crime.by.day$DayOfWeek,
  crime.by.day$count
)

# geom = "bar" rather than "histogram": DayOfWeek is discrete, and
# ggplot2 >= 2.0 refuses to bin a discrete variable.
p2 <- qplot(
  DayOfWeek,
  data = crime.by.day,
  weight = count,
  geom = "bar",
  xlab = "Day of Week",
  ylab = "# of Crimes",
  main = "# of Crimes by Day"
)
|
按照犯罪时间统计
1
2
3
4
5
6
7
8
|
# Number of incidents per hour of the day, sorted from most to fewest.
crime.by.time <- crime.df %>%
  group_by(Hour) %>%
  summarise(count = n())
crime.by.time$Hour <- order.level(
  crime.by.time$Hour,
  crime.by.time$count
)

# geom = "bar" rather than "histogram": Hour is a factor, and
# ggplot2 >= 2.0 refuses to bin a discrete variable.
p3 <- qplot(
  Hour,
  data = crime.by.time,
  weight = count,
  geom = "bar",
  xlab = "Time",
  ylab = "# of Crimes",
  main = "# of Crimes by Time"
)

# District, weekday and hour charts on one page, two columns wide.
multiplot(p1, p2, p3, cols = 2)
|
犯罪区域,日期,时间分析
Richmond和Park区域是最安全的地方,案件数少于200。而案件发生的日期没有太明显的规律,周一到周日基本都在4000左右。相反,案件发生的时间有明显的高峰和低谷:18:00~19:00(2087)、17:00~18:00(2022)、19:00~20:00(1838)是典型的高峰,而04:00~06:00(<400)则是低谷。
某种特定偷窃案件是否在某个区域更普遍?
1
2
3
4
5
6
7
8
9
10
11
12
13
|
# Compare how the theft-related categories are distributed over districts.

# Keep every incident whose category name contains "THEFT".
theft.filter <- grep("THEFT", as.character(crime.df$Category))
crime.by.theft <- crime.df[theft.filter, ]

# Count per district and category. summarise() leaves the result grouped by
# PdDistrict, so ungroup() first to make arrange() sort the whole table by
# category and then by descending count.
crime.by.group <- crime.by.theft %>%
  group_by(PdDistrict, Category) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  arrange(Category, desc(count))

# Overall count per theft category, largest first.
crime.by.number <- crime.by.theft %>%
  group_by(Category) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# One bar per category, filled by each district's share of the count.
# geom_bar rather than geom_histogram: Category is discrete, and
# ggplot2 >= 2.0 refuses to bin a discrete variable.
ggplot(
  crime.by.group,
  aes(x = Category, fill = PdDistrict, weight = count)
) +
  geom_bar(position = "fill", col = gray(0.2)) +
  ylab(label = "Ratio by District") +
  theme_tufte()
|
不同偷窃案件发生区域对比
larceny/theft在Southern和Central区域更普遍,而vehicle theft则在Ingleside、Mission和Tenderloin区域更普遍。