-
字符串处理
-
常用字符串函数:nchar(),substring(),paste().
nchar(),可以显示字符串里面的字符数量
substring(),可以截取字符串
paste()可以拼贴字符串
> charvector=c("1970 1,1003.2|4.11|6.21 Mio","1975|21,034.6|5.31|7.11 Mio") > charvector [1] "1970 1,1003.2|4.11|6.21 Mio" "1975|21,034.6|5.31|7.11 Mio" > nchar(charvector) [1] 27 27 > years=substring(charvector,first = 1,last=4) #结构中first = 1,last=4,可以简写为1,4 > years [1] "1970" "1975" > years=substring(charvector,1,4) > years [1] "1970" "1975" > paste(years,"12","31",sep="-") #sep表示以何种符号分割 [1] "1970-12-31" "1975-12-31" > paste(years,"31","12",sep="-") [1] "1970-31-12" "1975-31-12" > #按输入字符的顺序paste
-
搜索字符串
grep(),regexpr()
grep()在字符向量中搜索与指定模式(正则表达式)匹配的元素,默认返回匹配元素的下标;regexpr()则返回每个元素中首次匹配的起始位置,未匹配时返回-1。
#搜索字符串中的模式 > #grep() > data(longley) > names=names(longley) > names [1] "GNP.deflator" "GNP" "Unemployed" "Armed.Forces" "Population" [6] "Year" "Employed" > index=grep("GNP",names) #搜索含GNP字样的字符串 > index [1] 1 2 > names[index] [1] "GNP.deflator" "GNP" > index=grep("^G",names) #搜索以G字母开头的字符串 > index [1] 1 2 #regexpr()判断向量names中的各元素是否含有字符串,有则返回位次,无则返回值-1,返回结果还有一些其他内容 > gnpMatch=regexpr("GNP",names) > gnpMatch [1] 1 1 -1 -1 -1 -1 -1 attr(,"match.length") [1] 3 3 -1 -1 -1 -1 -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE > index=1:length(names) > index[gnpMatch>0] [1] 1 2
-
字符串替换
利用gsub()可以进行字符串替换。注意:gsub()的第一个参数是正则表达式,其中"."表示任意单个字符;若要匹配字面意义上的点号,应写成"\\."或加上参数fixed=TRUE。
#字符串替换 > #gsub() > gsub(".Forces","_Forces",names) [1] "GNP.deflator" "GNP" "Unemployed" "Armed_Forces" "Population" [6] "Year" "Employed"
-
-
因子处理
-
cut()函数,对数据进行切片分组,cut返回的结果类型为因子。breaks可以是一组分割点,也可以是分组的个数;区间默认左开右闭,labels按区间从小到大的顺序依次对应。
> data("longley") > gnp=longley$GNP > regimes=(2:6)*100;regimes [1] 200 300 400 500 600 > cut(x=gnp,regimes) #breaks为向量时,按给定的分割点分组 [1] (200,300] (200,300] (200,300] (200,300] (300,400] (300,400] (300,400] (300,400] [9] (300,400] (400,500] (400,500] (400,500] (400,500] (500,600] (500,600] (500,600] Levels: (200,300] (300,400] (400,500] (500,600] > gnp [1] 234.289 259.426 258.054 284.599 328.975 346.999 365.385 363.112 397.469 419.180 [11] 442.769 444.546 482.704 502.601 518.173 554.894 > cut(x=gnp,breaks=3) [1] (234,341] (234,341] (234,341] (234,341] (234,341] (341,448] (341,448] (341,448] [9] (341,448] (341,448] (341,448] (341,448] (448,555] (448,555] (448,555] (448,555] Levels: (234,341] (341,448] (448,555] > cut(x=gnp,breaks=5) [1] (234,298] (234,298] (234,298] (234,298] (298,363] (298,363] (363,427] (363,427] [9] (363,427] (363,427] (427,491] (427,491] (427,491] (491,555] (491,555] (491,555] Levels: (234,298] (298,363] (363,427] (427,491] (491,555] > groups=cut(x=gnp,breaks=3,labels=c("low","median","high"));groups [1] low low low low low median median median median median median [12] median high high high high Levels: low median high > groups=cut(x=gnp,breaks=3,labels=c("high","median","low"));groups #注意标签顺序:labels按区间从小到大依次对应,此处顺序写反,数值最小的一组被标成了high [1] high high high high high median median median median median median [12] median low low low low Levels: high median low >
-
利用factor()函数可以将向量转换为因子类型;因子不能直接参与算术运算(如下例中x+y的结果全为NA并给出警告)。
> #注意标签顺序,与排序方式有关。建议指定方式 > #函数factor > x=1:6;x [1] 1 2 3 4 5 6 > class(x) [1] "integer" > y=factor(x) > y [1] 1 2 3 4 5 6 Levels: 1 2 3 4 5 6 > class(y) [1] "factor" > x+y [1] NA NA NA NA NA NA Warning message: In Ops.factor(x, y) : ‘+’ not meaningful for factors
-
-
日期序列对象的类型
-
Date类型
包含日期数据,不包含时间和时区信息
常用的获取日期的方法是Sys.Date()函数,返回当前系统的日期
as.numeric(),unclass()可以将日期对象转换为它的内部形式
-
POSIX类型
名称来源于“可移植操作系统接口”(Portable Operating System Interface)的缩写,末尾的X表示其源自UNIX。
两种类型:POSIXct(日历时间,calendar time),POSIXlt(本地时间,local time,内部以列表形式存储)
POSIXct只需要一个双精度数值(自1970-01-01 UTC起的秒数),而POSIXlt需要一个包含秒、分、时、日、月、年等分量的列表,前者更节省空间。
###date() > today<-Sys.Date() > today [1] "2022-07-27" > class(today) [1] "Date" > as.numeric(today) [1] 19200 > ndate<-unclass(today) > ndate [1] 19200 > as.Date(ndate,origin = '1970-01-01') #在R语言中,是以距离1970-01-01的天数储存数据 [1] "2022-07-27" > class(ndate)='Date' #引号不可省 > ndate [1] "2022-07-27" > ###POSIX类型 > time_ct<-Sys.time() > time_ct [1] "2022-07-27 09:32:13 CST" > class(time_ct) [1] "POSIXct" "POSIXt" > as.numeric(time_ct) [1] 1658885533 > unclass(time_ct) [1] 1658885533 > time_lt<-as.POSIXlt(time_ct) > time_ct [1] "2022-07-27 09:32:13 CST" > time_lt [1] "2022-07-27 09:32:13 CST" > class(time_ct) [1] "POSIXct" "POSIXt" > class(time_lt) [1] "POSIXlt" "POSIXt" > as.numeric(time_lt) [1] 1658885533 > unclass(time_lt) $sec [1] 13.02043 $min [1] 32 $hour [1] 9 $mday [1] 27 $mon [1] 6 $year [1] 122 $wday [1] 3 $yday [1] 207 $isdst [1] 0 $zone [1] "CST" $gmtoff [1] 28800 attr(,"tzone") [1] "" "CST" "CDT" > n_sec<-as.numeric(time_lt) > class(time_lt) [1] "POSIXlt" "POSIXt" > class(n_sec)=c("POSIXct","POSIXt");n_sec;class(n_sec) [1] "2022-07-27 09:32:13 CST" [1] "POSIXct" "POSIXt"
-
R语言学习 day_4
于 2022-07-27 09:52:26 首次发布