San Francisco过去3月的犯罪案件可视化分析

最新推荐文章于 2022-12-28 12:28:10 发布

RoQuant

最新推荐文章于 2022-12-28 12:28:10 发布

阅读量1.9k

点赞数

分类专栏： R

R 专栏收录该内容

421 篇文章 15 订阅

订阅专栏

目标

分析San Francisco的犯罪案件的模式。数据来自:
https://data.sfgov.org/Public-Safety/SFPD-Incidents-Previous-Three-Months/tmnf-yvry?

希望回答以下问题:

在哪里停车最危险?
SF最安全的地方是哪里? 每周的哪天/哪个时间最危险?
某种特定偷窃案件是否在某个区域更普遍?

准备分析用的包

我们用dplyr来整理数据，ggplot2以及ggmap来进行数据可视化

1	require(dplyr)

1	require(ggplot2)

1	## Loading required package: ggplot2

1	require(ggmap)

1	## Loading required package: ggmap

1	require(ggthemes)

1	## Loading required package: ggthemes

准备数据

先设置工作目录，读入数据。然后呢把日期格式化，同时呢我们把时间按照小时来分，不考虑分钟。

     
     
      
      # Work from the project folder that holds the downloaded CSV export.
      setwd("d:/project/datascience/teamleada")

      # Load the raw incident records.
      crime <- read.csv("./SFPD_Incidents_-_Previous_Three_Months.csv")

      # Drop the two columns that the analysis below never touches.
      crime$IncidntNum <- NULL
      crime$Location <- NULL

      # Parse the month/day/year strings into Date values.
      crime$Date <- as.Date(crime$Date, format = "%m/%d/%Y")

      # Keep only the first two characters of Time (the hour part) as a
      # factor; the minutes are deliberately ignored.
      crime$Hour <- as.factor(
        substr(as.character(crime$Time), start = 1, stop = 2)
      )

      # Wrap the data frame for dplyr and show a preview.
      # NOTE(review): tbl_df() is deprecated in current dplyr releases;
      # as_tibble() is the documented replacement -- confirm the installed
      # version before switching.
      crime.df <- tbl_df(crime)
      crime.df

     
     
      
      ## Source: local data frame [30,760 x 11]
     
     
     
     
      
      ## 
     
     
     
     
      
      ##         Category                                   Descript DayOfWeek
     
     
     
     
      
      ## 1  LARCENY/THEFT             GRAND THEFT FROM UNLOCKED AUTO    Sunday
     
     
     
     
      
      ## 2  LARCENY/THEFT               GRAND THEFT FROM LOCKED AUTO    Sunday
     
     
     
     
      
      ## 3  LARCENY/THEFT               GRAND THEFT FROM LOCKED AUTO    Sunday
     
     
     
     
      
      ## 4  DRUG/NARCOTIC             POSSESSION OF METH-AMPHETAMINE    Sunday
     
     
     
     
      
      ## 5  DRUG/NARCOTIC                      POSSESSION OF COCAINE    Sunday
     
     
     
     
      
      ## 6  LARCENY/THEFT               GRAND THEFT FROM LOCKED AUTO    Sunday
     
     
     
     
      
      ## 7       WARRANTS                             WARRANT ARREST    Sunday
     
     
     
     
      
      ## 8  VEHICLE THEFT                          STOLEN AUTOMOBILE    Sunday
     
     
     
     
      
      ## 9  LARCENY/THEFT                    PETTY THEFT OF PROPERTY    Sunday
     
     
     
     
      
      ## 10       ROBBERY ROBBERY OF A CHAIN STORE WITH BODILY FORCE    Sunday
     
     
     
     
      
      ## ..           ...                                        ...       ...
     
     
     
     
      
      ## Variables not shown: Date (date), Time (fctr), PdDistrict (fctr),
     
     
     
     
      
      ##   Resolution (fctr), Address (fctr), X (dbl), Y (dbl), Hour (fctr)

辅助函数

这里主要有两个辅助函数。第一个用于排序：在画直方图的时候，我们希望按count大小来排序，而不是按照数据中factor变量的level来排序，所以写了一个辅助函数来对factor的level重新排序

     
     
      
      # Re-level a grouping variable so that its levels run from the largest
      # count to the smallest, which makes bar charts come out sorted by size.
      #
      # Arguments:
      #   level.var  factor or character vector of group labels.
      #   count      numeric vector with either one entry per element of
      #              level.var (e.g. a summarised table with one row per
      #              group) or, for a factor, one entry per level in
      #              levels(level.var) order.
      #
      # Returns a factor with the same values as level.var whose levels are
      # sorted by decreasing count.
      #
      # The labels are ranked by their own values rather than by indexing
      # levels(level.var) with row positions: that indexing returned all NA
      # for character input and mis-assigned levels whenever the factor
      # carried levels that no longer occur in the data.
      order.level <- function(level.var, count) {
        values <- as.character(level.var)

        if (length(count) == length(values)) {
          # One count per element: rank the labels themselves.
          keys <- values
        } else if (is.factor(level.var) && length(count) == nlevels(level.var)) {
          # One count per level, given in levels() order.
          keys <- levels(level.var)
        } else {
          stop("'count' must have one entry per element or per level of ",
               "'level.var'", call. = FALSE)
        }

        ranked <- unique(keys[order(count, decreasing = TRUE)])
        factor(level.var, levels = ranked)
      }

另一个函数用来控制多个图形绘制时的排版，这里直接使用了R cookbook提供的函数

     
     
      
      #
      # Multiple plot function
      #
      # Draws several plot objects on one page using a grid layout.
      #
      # ggplot objects can be passed in ..., or to plotlist (as a list of
      # ggplot objects).
      # - cols:   Number of columns in layout
      # - layout: A matrix specifying the layout. If present, 'cols' is ignored.
      # - file:   Accepted but never read by this function.
      #
      # If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
      # then plot 1 will go in the upper left, 2 will go in the upper right, and
      # 3 will go all the way across the bottom.
      #
      # Called with no plots at all, the function draws nothing and returns
      # NULL invisibly.
      #
      multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
        # library() instead of require(): a missing package stops here with a
        # clear message instead of failing later on an unknown grid function.
        library(grid)

        # Make a list from the ... arguments and plotlist
        plots <- c(list(...), plotlist)
        numPlots <- length(plots)

        # Nothing to draw: leave the graphics device untouched.
        if (numPlots == 0) {
          return(invisible(NULL))
        }

        # If layout is NULL, then use 'cols' to determine layout
        if (is.null(layout)) {
          # Make the panel
          # ncol: Number of columns of plots
          # nrow: Number of rows needed, calculated from # of cols
          layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
                           ncol = cols, nrow = ceiling(numPlots / cols))
        }

        if (numPlots == 1) {
          print(plots[[1]])
        } else {
          # Set up the page
          grid.newpage()
          pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

          # Make each plot, in the correct location
          for (i in seq_len(numPlots)) {
            # Get the i,j matrix positions of the regions that contain this subplot
            matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

            print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                            layout.pos.col = matchidx$col))
          }
        }
      }

在哪里停车?

这里我们先只取VEHICLE THEFT进行分析，之后按照案件发生的区域分组统计

     
     
      
      # Keep only the vehicle-theft incidents and count them per police district.
      crime.df.vehicle <- filter(crime.df, Category == "VEHICLE THEFT")
      crime.vehicle.district <- crime.df.vehicle %>%
        group_by(PdDistrict) %>%
        summarise(count = n())

      # Re-level PdDistrict by count so the bars are drawn in sorted order.
      crime.vehicle.district$PdDistrict <- order.level(
        crime.vehicle.district$PdDistrict,
        crime.vehicle.district$count
      )

      # Bar chart of the pre-computed counts. stat = "identity" plots `count`
      # as the bar height; a histogram layer expects a continuous x and is
      # rejected for a discrete district factor by current ggplot2.
      p1 <- ggplot(crime.vehicle.district, aes(x = PdDistrict, y = count)) +
        geom_bar(stat = "identity") +
        xlab("District") +
        ylab("Vehicle Theft Count")

      # Longitude / latitude copies of the X / Y columns.
      crime.df$lon <- crime.df$X
      crime.df$lat <- crime.df$Y

      # Fetch the San Francisco base map.
      SFMap <- qmap("San Francisco", zoom = 13, color = "bw", legend = "topleft")

     
     
      
      #
      
      # Map from URL : http:
      
      //maps.googleapis.com/maps/api/staticmap?center=San+Francisco&zoom=13&size=%20640x640&scale=%202&maptype=terrain&sensor=false
     
     
     
     
      
      #
      
      # Google Maps API Terms of Service : http:
      
      //developers.google.com/maps/terms
     
     
     
     
      
      #
      
      # Information from URL : http:
      
      //maps.googleapis.com/maps/api/geocode/json?address=San+Francisco&sensor=false
     
     
     
     
      
      #
      
      # Google Maps API Terms of Service : http:
      
      //developers.google.com/maps/terms

     
     
      
      # Draw the vehicle-theft density map.
      #
      # x and y are mapped explicitly to the X / Y columns of crime.df.vehicle.
      # Without that, the layers inherit the base map's lon / lat mapping, and
      # crime.df.vehicle has no such columns (it was filtered before lon / lat
      # were added to crime.df), which fails with "object 'lon' not found".
      p2 <- SFMap +
        geom_density2d(
          data = crime.df.vehicle,
          aes(x = X, y = Y, group = 1)
        ) +
        stat_density2d(
          data = crime.df.vehicle,
          aes(x = X, y = Y, group = 1, fill = ..level.., alpha = ..level..),
          size = 0.01, bins = 16, geom = "polygon"
        ) +
        scale_fill_gradient(low = "green", high = "red") +
        # guide = "none" is the non-deprecated spelling of guide = FALSE.
        scale_alpha(range = c(0.00, 0.25), guide = "none") +
        theme(
          legend.position = "none",
          axis.title = element_blank(),
          text = element_text(size = 12)
        ) +
        ggtitle("Vehicle Theft")

      # Bar chart on top, density map underneath.
      multiplot(p1, p2, cols = 1)

1	## Loading required package: grid

1	## Error: object 'lon' not found

VEHICLE THEFT

显然Ingleside,Mission,Bayview区案件更多，密度图也反映了相同信息

SF最安全的地方是哪里? 每周的哪天/哪个时间最危险?

先按照每个区计算案件数目

     
     
      
      # Count all incidents per police district.
      crime.by.district <- crime.df %>%
        group_by(PdDistrict) %>%
        summarise(count = n())

      # Re-level the districts by count so the bars are drawn in sorted order.
      crime.by.district$PdDistrict <- order.level(
        crime.by.district$PdDistrict,
        crime.by.district$count
      )

      # Bar chart of the pre-computed counts. stat = "identity" plots `count`
      # as the bar height; a histogram layer expects a continuous x and is
      # rejected for a discrete factor by current ggplot2.
      p1 <- ggplot(crime.by.district, aes(x = PdDistrict, y = count)) +
        geom_bar(stat = "identity") +
        xlab("District") +
        ylab("# of Crimes") +
        ggtitle("# of Crimes by District")

根据每天来统计

     
     
      
      # Count all incidents per day of the week.
      crime.by.day <- crime.df %>%
        group_by(DayOfWeek) %>%
        summarise(count = n())

      # Re-level the weekdays by count so the bars are drawn in sorted order.
      crime.by.day$DayOfWeek <- order.level(
        crime.by.day$DayOfWeek,
        crime.by.day$count
      )

      # Bar chart of the pre-computed counts. stat = "identity" plots `count`
      # as the bar height; a histogram layer expects a continuous x and is
      # rejected for a discrete factor by current ggplot2.
      p2 <- ggplot(crime.by.day, aes(x = DayOfWeek, y = count)) +
        geom_bar(stat = "identity") +
        xlab("Day of Week") +
        ylab("# of Crimes") +
        ggtitle("# of Crimes by Day")

按照犯罪时间统计

     
     
      
      # Count all incidents per hour of the day.
      crime.by.time <- crime.df %>%
        group_by(Hour) %>%
        summarise(count = n())

      # Re-level the hours by count so the bars are drawn in sorted order.
      crime.by.time$Hour <- order.level(
        crime.by.time$Hour,
        crime.by.time$count
      )

      # Bar chart of the pre-computed counts. stat = "identity" plots `count`
      # as the bar height; a histogram layer expects a continuous x and is
      # rejected for a discrete factor by current ggplot2.
      p3 <- ggplot(crime.by.time, aes(x = Hour, y = count)) +
        geom_bar(stat = "identity") +
        xlab("Time") +
        ylab("# of Crimes") +
        ggtitle("# of Crimes by Time")

      # Lay the district, weekday and hour charts out in two columns.
      multiplot(p1, p2, p3, cols = 2)

犯罪区域，日期，时间分析

Richmond和Park区域是最安全的地方，案件数目少于200。而案件发生的日期显然没有太明显规律，周一到周日基本都在4000左右。相反，案件发生时间有明显的高峰和低谷：18:00~19:00(2087)、17:00~18:00(2022)、19:00~20:00(1838)是典型的高峰，而04:00~06:00(<400)则是低谷。

某种特定偷窃案件是否在某个区域更普遍?

     
     
      
      # Select every incident whose category mentions THEFT.
      theft.filter <- grep("THEFT", as.character(crime.df$Category))
      crime.by.theft <- crime.df[theft.filter, ]

      # Incident count per district and theft category, sorted within each
      # category from the most to the least affected district. ungroup()
      # clears the grouping left over from summarise() so that the sort
      # applies to the whole table.
      crime.by.group <- crime.by.theft %>%
        group_by(PdDistrict, Category) %>%
        summarise(count = n()) %>%
        ungroup() %>%
        arrange(Category, desc(count))

      # Total incident count per theft category, largest first.
      crime.by.number <- crime.by.theft %>%
        group_by(Category) %>%
        summarise(count = n()) %>%
        arrange(desc(count))

      # Stacked bars normalised to 1: each bar shows how one theft category
      # is split across the districts. stat = "identity" uses the
      # pre-computed counts; a histogram layer expects a continuous x and is
      # rejected for the discrete Category variable by current ggplot2.
      ggplot(crime.by.group,
             aes(x = Category, y = count, fill = PdDistrict)) +
        geom_bar(stat = "identity", position = "fill", colour = gray(0.2)) +
        ylab(label = "Ratio by District") +
        theme_tufte()