目录
(2) 如何把字符转化成POSIXct/POSIXlt时间格式
#### Movie data ####
# Load the movie data set used throughout this tutorial.
# NOTE: the workspace is intentionally not wiped with rm(list = ls());
# restart the R session instead if a clean environment is needed.
# stringsAsFactors = TRUE is spelled out because the default changed to FALSE
# in R 4.0.0, while the character-type section further down expects the text
# columns of this first load to have been read as factors.
movie <- read.csv(
  "电影数据.csv",
  header = TRUE,
  stringsAsFactors = TRUE,
  fileEncoding = "UTF-8"
)
head(movie)
## name boxoffice doubanscore type duration showtime
## 1 叶问3 77060.44 6.4 动作 105 2016/3/4
## 2 美人鱼 338583.26 6.9 喜剧 93 2016/2/8
## 3 女汉子真爱公式 6184.45 4.5 喜剧 93 2016/3/18
## 4 西游记之孙悟空三打白骨精 119956.51 5.7 喜剧 120 2016/2/8
## 5 澳门风云三 111693.89 4.0 喜剧 112 2016/2/8
## 6 功夫熊猫3 99832.53 7.7 喜剧 95 2016/1/29
## director star1 index1 star2 index2
## 1 叶伟信 甄子丹 11385 张晋 4105
## 2 周星驰 邓超 41310 林允 9292
## 3 郭大雷 赵丽颖 181979 张翰 44277
## 4 郑保瑞 郭富城 12227 巩俐 8546
## 5 王晶 周润发 16731 刘德华 30277
## 6 吕寅荣 杰克布莱克 178 安吉丽娜朱莉 1540
1.数值型(numeric)
# Example using the movie data: inspect the class of two numeric columns
class(movie[["boxoffice"]])
class(movie[["doubanscore"]])
## [1] "numeric"
## [1] "numeric"
# Assign a numeric value to a variable and check its class
a <- 2
class(a)
## [1] "numeric"
# Overflowing the double range gives positive infinity
exp(1000)
## [1] Inf
# A negative number divided by zero gives negative infinity
-10 / 0
## [1] -Inf
# Both operands overflow to Inf, and Inf / Inf is NaN
exp(1000) / exp(990)
## [1] NaN
# A result that fits in a double is returned as an ordinary number
exp(10)
## [1] 22026.47
2.字符型
# Define a character value: the quotes make "2" a string, not a number
a <- "2"
class(a)
## [1] "character"
# Check whether the "type" and "name" columns of the movie data are character
class(movie[["type"]])
## [1] "factor"
class(movie[["name"]])
## [1] "factor"
3.逻辑型数据
# Re-read the data, keeping text columns as character instead of factor.
# TRUE/FALSE are written out because T and F are ordinary variables that can
# be reassigned, which would silently change these arguments.
movie <- read.csv(
  "电影数据.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8"
)
# A comparison yields a logical value: is the type of "美人鱼" equal to "喜剧"?
movie$type[movie$name == "美人鱼"] == "喜剧"
## [1] TRUE
# Names of movies whose type is "喜剧" and whose score is above 7;
# & combines the two logical masks element by element
movie$name[movie$type == "喜剧" & movie$doubanscore > 7]
## [1] "功夫熊猫3"
# Logical values act as 0/1 in arithmetic (here FALSE + TRUE)
(1 == 2) + (3 < 4)
## [1] 1
4.因子型数据
(1) 什么是因子型数据
# An unordered factor: the distinct values become its levels
genders <- factor(c("男", "女", "女", "男", "男"))
genders
## [1] 男 女 女 男 男
## Levels: 男 女
# An ordered factor; no levels are supplied, so the level order shown in the
# output is the sorted order of the labels rather than the intended ranking
class <- factor(c("Poor", "Improved", "Excellent"), ordered = TRUE)
class
## [1] Poor Improved Excellent
## Levels: Excellent < Improved < Poor
(2) 如何改变因子型数据各水平的编码顺序
# Supply the levels explicitly so the ordering runs Poor < Improved < Excellent
class <- factor(
  c("Poor", "Improved", "Excellent"),
  levels = c("Poor", "Improved", "Excellent"),
  ordered = TRUE
)
class
## [1] Poor Improved Excellent
## Levels: Poor < Improved < Excellent
(3) 如何正确将因子型数据和字符型数据互相转化
R中的因子型数据对应的是定性(名义)变量和定序变量,因此在分析中需要用到这两类变量时,可以把字符型数据转换为因子型
# Start from a plain character vector
all <- c("男", "女", "女", "男", "男")
# Character -> factor
gender <- as.factor(all)
# Confirm the type after the conversion
is.factor(gender)
## [1] TRUE
class(gender)
## [1] "factor"
# Factor -> character
genders <- as.character(gender)
# Confirm the type after the conversion
is.character(genders)
## [1] TRUE
class(genders)
## [1] "character"
5.时间类数据
(1) 如何把字符转化成Date日期格式
# head() shows the first 6 elements; class() reports the object's type
head(movie[["showtime"]])
## [1] "2016/3/4" "2016/2/8" "2016/3/18" "2016/2/8" "2016/2/8" "2016/1/29"
class(movie[["showtime"]])
## [1] "character"
# Convert the column in place; no format is given, so as.Date() relies on its
# default formats to parse the "2016/3/4" style strings
movie[["showtime"]] <- as.Date(movie[["showtime"]])
head(movie[["showtime"]])
## [1] "2016-03-04" "2016-02-08" "2016-03-18" "2016-02-08" "2016-02-08"
## [6] "2016-01-29"
class(movie[["showtime"]])
## [1] "Date"
# What happens when the format argument does not match the input?
# Switch LC_TIME to the C locale so %b matches English month abbreviations
Sys.setlocale(category = "LC_TIME", locale = "C")
## [1] "C"
x <- c("1jan1960", "2jan1960", "31mar1960", "30jul1960")
# y <- as.Date(x)  # kept disabled: x is not in one of as.Date()'s default formats
# Describe the layout explicitly: day, abbreviated month name, 4-digit year
y <- as.Date(x, format = "%d%b%Y")
y
## [1] "1960-01-01" "1960-01-02" "1960-03-31" "1960-07-30"
(2) 如何把字符转化成POSIXct/POSIXlt时间格式
# A string already in "YYYY-MM-DD HH:MM:SS" layout parses without a format
as.POSIXct(x = "2015-11-27 01:30:00")
## [1] "2015-11-27 01:30:00 CST"
# as.POSIXct("November-27-2015 01:30:00")  # kept disabled: non-standard layout
# Describe the layout explicitly; %B stands for the full month name
as.POSIXct(
  x = "November-27-2015 01:30:00",
  format = "%B-%d-%Y %H:%M:%S"
)
## [1] "2015-11-27 01:30:00 CST"
(3) 如何把时间数据摆弄成你想要的形式
# Take the first six release dates as the working sample
m <- head(movie[["showtime"]])
m
## [1] "2016-03-04" "2016-02-08" "2016-03-18" "2016-02-08" "2016-02-08"
## [6] "2016-01-29"
# Rewrite as month name, day, year
format(m, format = "%B %d %Y")
## [1] "March 04 2016" "February 08 2016" "March 18 2016"
## [4] "February 08 2016" "February 08 2016" "January 29 2016"
# Same layout with the weekday name appended
format(m, format = "%B %d %Y %A")
## [1] "March 04 2016 Friday" "February 08 2016 Monday"
## [3] "March 18 2016 Friday" "February 08 2016 Monday"
## [5] "February 08 2016 Monday" "January 29 2016 Friday"
# Extract only the month name
format(m, format = "%B")
## [1] "March" "February" "March" "February" "February" "January"
# Print the current system time
Sys.time()
## [1] "2018-07-31 19:46:14 CST"
# Check which class Sys.time() returns
class(Sys.time())
## [1] "POSIXct" "POSIXt"
# Keep only month name, day and year
format(Sys.time(), format = "%B %d %Y")
## [1] "July 31 2018"
# Year / month name / abbreviated weekday, followed by the clock time
format(Sys.time(), format = "%Y/%B/%a %H:%M:%S")
## [1] "2018/July/Tue 19:46:14"
(4) 一款处理时间数据的专用包lubridate
# install.packages("lubridate")  # one-time install; the package name must be quoted
library(lubridate)
# The same kind of date written in many layouts; c() coerces the leading
# number to a string because the other elements are strings
x <- c(
  20090101, "2009-01-02", "2009 01 03", "2009-1-4", "2009-1,5",
  "Created on 2009 1 6", "200901 !!! 07"
)
# ymd() parses year-month-day input despite the differing separators
ymd(x)
## [1] "2009-01-01" "2009-01-02" "2009-01-03" "2009-01-04" "2009-01-05"
## [6] "2009-01-06" "2009-01-07"
# Day of the month
mday(as.Date("2015-11-20"))
## [1] 20
# Day of the week as a number (2015-11-20 is a Friday)
wday(as.Date("2015-11-20"))
## [1] 6
# Hour and minute components of a date-time
hour(as.POSIXct("2015-11-20 01:30:00"))
## [1] 1
minute(as.POSIXct("2015-11-20 01:30:00"))
## [1] 30
(5) 时间类数据的操作
做差
# Number of days between two dates: subtracting Dates gives a time difference
begin <- as.Date("2016-03-04")
end <- as.Date("2016-05-08")
during <- end - begin
during
## Time difference of 65 days
# The same gap expressed in weeks and in hours
difftime(time1 = end, time2 = begin, units = "weeks")
## Time difference of 9.285714 weeks
difftime(time1 = end, time2 = begin, units = "hours")
## Time difference of 1560 hours
排序
# Sort the date column on its own
head(movie[["showtime"]])
## [1] "2016-03-04" "2016-02-08" "2016-03-18" "2016-02-08" "2016-02-08"
## [6] "2016-01-29"
head(sort(movie[["showtime"]]))
## [1] "2016-01-29" "2016-02-08" "2016-02-08" "2016-02-08" "2016-03-04"
## [6] "2016-03-18"
# Reorder the rows of the whole table by date; show only the first 6 rows
# and the name and showtime columns
head(movie[order(movie[["showtime"]]), c("name", "showtime")])
## name showtime
## 6 功夫熊猫3 2016-01-29
## 2 美人鱼 2016-02-08
## 4 西游记之孙悟空三打白骨精 2016-02-08
## 5 澳门风云三 2016-02-08
## 1 叶问3 2016-03-04
## 3 女汉子真爱公式 2016-03-18