R语言学习 day_4

  • 字符串处理

    1. 常用字符串函数:nchar(),substring(),paste().

      nchar(),可以显示字符串里面的字符数量

      substring(),可以截取字符串

      paste()可以拼接字符串

      > charvector=c("1970 1,1003.2|4.11|6.21 Mio","1975|21,034.6|5.31|7.11 Mio")
      > charvector
      [1] "1970 1,1003.2|4.11|6.21 Mio" "1975|21,034.6|5.31|7.11 Mio"
      > nchar(charvector)
      [1] 27 27
      > years=substring(charvector,first = 1,last=4) #结构中first = 1,last=4,可以简写为1,4
      > years
      [1] "1970" "1975"
      > years=substring(charvector,1,4)
      > years
      [1] "1970" "1975"
      > paste(years,"12","31",sep="-") #sep指定拼接时各部分之间使用的分隔符
      [1] "1970-12-31" "1975-12-31"
      > paste(years,"31","12",sep="-")
      [1] "1970-31-12" "1975-31-12"
      > #按输入字符的顺序paste
      
    2. 搜索字符串

      grep(),regexpr()

      grep(),可以在字符向量中搜索含有指定模式的元素,默认返回匹配元素的下标。

      #搜索字符串中的模式
      > #grep()
      > data(longley)
      > names=names(longley)
      > names
      [1] "GNP.deflator" "GNP"          "Unemployed"   "Armed.Forces" "Population"  
      [6] "Year"         "Employed"    
      > index=grep("GNP",names) #搜索含GNP字样的字符串
      > index
      [1] 1 2
      > names[index]
      [1] "GNP.deflator" "GNP"         
      > index=grep("^G",names) #搜索以G字母开头的字符串
      > index
      [1] 1 2
      #regexpr()在向量names的每个元素中搜索模式:匹配成功则返回首次匹配在该元素中的起始字符位置,匹配失败则返回-1;返回结果还带有match.length(匹配长度)等属性
      > gnpMatch=regexpr("GNP",names)
      > gnpMatch
      [1]  1  1 -1 -1 -1 -1 -1
      attr(,"match.length")
      [1]  3  3 -1 -1 -1 -1 -1
      attr(,"index.type")
      [1] "chars"
      attr(,"useBytes")
      [1] TRUE
      > index=1:length(names)
      > index[gnpMatch>0]
      [1] 1 2
      
    3. 字符串替换

      利用gsub()可以进行字符串替换。注意第一个参数是正则表达式,其中"."匹配任意单个字符;若要匹配字面上的点号,应写成"\\."或加上参数fixed=TRUE

      #字符串替换
      > #gsub()
      > gsub(".Forces","_Forces",names)
      [1] "GNP.deflator" "GNP"          "Unemployed"   "Armed_Forces" "Population"  
      [6] "Year"         "Employed"
      
  • 因子处理

    1. cut()函数,对数据进行切片分组,cut返回的结果类型为 因子

      > data("longley")
      > gnp=longley$GNP
      > regimes=(2:6)*100;regimes
      [1] 200 300 400 500 600
      > cut(x=gnp,regimes) #以regimes作为分割点(breaks)分组;不指定labels时,默认用区间作为因子水平的标签
       [1] (200,300] (200,300] (200,300] (200,300] (300,400] (300,400] (300,400] (300,400]
       [9] (300,400] (400,500] (400,500] (400,500] (400,500] (500,600] (500,600] (500,600]
      Levels: (200,300] (300,400] (400,500] (500,600]
      > gnp
       [1] 234.289 259.426 258.054 284.599 328.975 346.999 365.385 363.112 397.469 419.180
      [11] 442.769 444.546 482.704 502.601 518.173 554.894
      > cut(x=gnp,breaks=3)
       [1] (234,341] (234,341] (234,341] (234,341] (234,341] (341,448] (341,448] (341,448]
       [9] (341,448] (341,448] (341,448] (341,448] (448,555] (448,555] (448,555] (448,555]
      Levels: (234,341] (341,448] (448,555]
      > cut(x=gnp,breaks=5)
       [1] (234,298] (234,298] (234,298] (234,298] (298,363] (298,363] (363,427] (363,427]
       [9] (363,427] (363,427] (427,491] (427,491] (427,491] (491,555] (491,555] (491,555]
      Levels: (234,298] (298,363] (363,427] (427,491] (491,555]
      > groups=cut(x=gnp,breaks=3,labels=c("low","median","high"));groups
       [1] low    low    low    low    low    median median median median median median
      [12] median high   high   high   high  
      Levels: low median high
      > groups=cut(x=gnp,breaks=3,labels=c("high","median","low"));groups #注意标签顺序:labels按区间从小到大依次对应,这里最小的区间被标成了"high"。建议显式指定labels并核对其顺序与区间顺序一致
       [1] high   high   high   high   high   median median median median median median
      [12] median low    low    low    low   
      Levels: high median low
      > 
      
    2. 利用factor()函数可以将向量转换为因子类型;因子不能直接参与算术运算(如下面的x+y会返回NA并给出警告)

      > #注意标签顺序,与排序方式有关。建议指定方式
      > #函数factor
      > x=1:6;x
      [1] 1 2 3 4 5 6
      > class(x)
      [1] "integer"
      > y=factor(x)
      > y
      [1] 1 2 3 4 5 6
      Levels: 1 2 3 4 5 6
      > class(y)
      [1] "factor"
      > x+y
      [1] NA NA NA NA NA NA
      Warning message:
      In Ops.factor(x, y) : ‘+’ not meaningful for factors
      
  • 日期序列对象的类型

    1. date类型

      包含日期数据,不包含时间和时区信息

      常用获取日期的方法就是Sys.Date()函数,返回当前系统的日期

      as.numeric(),unclass()可以将日期对象转换为它的内部形式

    2. POSIX类型

      名称来源于“可移植操作系统接口”(Portable Operating System Interface)的缩写,末尾的X表示其源自UNIX。

      两种类型:POSIXct(日历时间,calendar time),POSIXlt(本地时间,local time,以列表形式存储各时间分量)

      POSIXct只需要一个双精度数值,而POSIXlt需要一个列表,前者更节省空间。

    ###Date类型
    > today<-Sys.Date()
    > today
    [1] "2022-07-27"
    > class(today)
    [1] "Date"
    > as.numeric(today)
    [1] 19200
    > ndate<-unclass(today)
    > ndate
    [1] 19200
    > as.Date(ndate,origin = '1970-01-01')
    #在R语言中,Date对象在内部是以距离1970-01-01的天数储存的
    [1] "2022-07-27"
    > class(ndate)='Date'
    #引号不可省
    > ndate
    [1] "2022-07-27"
    
    > ###POSIX类型
    > time_ct<-Sys.time()
    > time_ct
    [1] "2022-07-27 09:32:13 CST"
    > class(time_ct)
    [1] "POSIXct" "POSIXt" 
    > as.numeric(time_ct)
    [1] 1658885533
    > unclass(time_ct)
    [1] 1658885533
    > time_lt<-as.POSIXlt(time_ct)
    > time_ct
    [1] "2022-07-27 09:32:13 CST"
    > time_lt
    [1] "2022-07-27 09:32:13 CST"
    > class(time_ct)
    [1] "POSIXct" "POSIXt" 
    > class(time_lt)
    [1] "POSIXlt" "POSIXt" 
    > as.numeric(time_lt)
    [1] 1658885533
    > unclass(time_lt)
    $sec
    [1] 13.02043
    
    $min
    [1] 32
    
    $hour
    [1] 9
    
    $mday
    [1] 27
    
    $mon
    [1] 6
    
    $year
    [1] 122
    
    $wday
    [1] 3
    
    $yday
    [1] 207
    
    $isdst
    [1] 0
    
    $zone
    [1] "CST"
    
    $gmtoff
    [1] 28800
    
    attr(,"tzone")
    [1] ""    "CST" "CDT"
    > n_sec<-as.numeric(time_lt)
    > class(time_lt)
    [1] "POSIXlt" "POSIXt" 
    > class(n_sec)=c("POSIXct","POSIXt");n_sec;class(n_sec)
    [1] "2022-07-27 09:32:13 CST"
    [1] "POSIXct" "POSIXt" 
    
    
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值