R基本数据操作

最新推荐文章于 2024-04-17 17:31:12 发布
你看起来很好吃
最新推荐文章于 2024-04-17 17:31:12 发布
阅读量2.3k
点赞数 2
分类专栏： R
R 专栏收录该内容
1 篇文章 0 订阅
订阅专栏
library(MASS)
# 查看可用数据集
data()
# 获取数据集
data(Insurance)
# 数据集的介绍
?Insurance
summary(Insurance)
# 维度
dim(Insurance)
# 变量名
names(Insurance)
# top\tail records
head(Insurance)
tail(Insurance)
# 类型
class(Insurance$District)
# 因子水平值
levels(Insurance$Age)
# 是否为数值型数据
is.numeric(Insurance$District)
# # 数据抽样
# 1 简单随机抽样
sub1 <- sample(nrow(Insurance),# 行号
               10,             # 样本个数
               replace = T     # 有放回
               )
sub1
Insurance[sub1,]
# 2 分层抽样 
install.packages("sampling")
library(sampling)
# 分层抽样应对依据变量升序排序。
Insurance <- Insurance[order(Insurance$District),]
sub2 <- strata(Insurance,
               stratanames = "District", # 分层依据变量
               size = c(1,2,3,4),        # 各层样本数
               method = "srswor"         # 无放回
               )
sub2
getdata(Insurance,sub2)
# 3 整群抽样 cluster()
# 训练集、测试集
train_sub <- sample(nrow(Insurance),3/4*nrow(Insurance))
train_data <- Insurance[train_sub,]
test_data <- Insurance[-train_sub,]
dim(train_data);dim(test_data)

# # 用R 获取数据
# CSV
getwd()
setwd("/Users/sun/dianping/learning material/DMLearningTeam/Part2")
write.csv(Insurance,"Insurance.csv")
Insur_csv <- read.csv("Insurance.csv")
head(Insur_csv)
Insur_csv1 <- read.table("Insurance.csv",header = T,sep = ",")
head(Insur_csv1)
# TXT
write.table(Insurance,"Insurance.txt")
Insur_txt <- read.table("Insurance.txt")
head(Insur_txt)
Insur_txt1 <- read.csv("Insurance.txt",header = T,sep = "")
head(Insur_txt1)
# EXCEL
# 1.转换为csv
# 2.xlsx
library(xlsx)
write.xlsx(Insurance,"Insurance.xlsx")
Insur_xlsx <- read.xlsx("Insurance.xlsx",1)  # 取第一个sheet
head(Insur_xlsx)
# 使用foreign包读写SPSS,SAS等统计软件数据。

# # 探索性数据分析
#数字化探索
attributes(Insurance)  # 属性列表
str(Insurance)  # 内部结构
summary(Insurance) # 变量概括
install.packages("Hmisc")
library(Hmisc)
describe(Insurance[,c(1:2,4)])
#install.packages("fBasics") #basicStats()
# 偏度和峰度
library(timeDate)
skewness(Insurance[,4:5]) # 大于0，正偏，均值右偏，右长尾现象
kurtosis(Insurance[,4:5]) # 大于0，陡峭。
# 稀疏性
install.packages("Matrix")
library(Matrix)
i <- sample(1:10,10,replace = T)
j <- sample(1:10,10,replace = T)
(A <- sparseMatrix(i,j,x = 1))
loca <- which(A== 1,arr.ind = T)
plot(loca,pch = 22)
# 缺失值
install.packages("mice") # md.pattern(data.set)
# 相关性
#install.packages("rattle")
library(rattle)
data(weather)
var <- c(12:21)
# 相关系数矩阵
(cor_matrix <- cor(weather[var],use = "pairwise"))
#install.packages("ellipse")
library(ellipse)
# 圆形越窄，相关性越高；向右倾斜表示正相关。
plotcorr(cor_matrix,diag = T,type = "lower",col = rep(c("white","black"),5))
# 可视化探索
# 直方图 (大盘订单金额分布)
hist(Insurance$Claims,main = "Histogram of Freq of Insurance$Claims")
hist(Insurance$Claims,freq = F, # not frequency
     density = 20, # 添加阴影
     main = "Histogram of Density of Insurance$Claims"
     )
lines(density(Insurance$Claims)) # 概率密度曲线
#自定义组距
str( hist(Insurance$Claims,breaks = 20, #组距
          labels = T, # 标注频数
          col = "black", border = "white", # 矩形颜色
          main = "Histogram of Insurance$Claims with 20 bars"
          )
  )
# 箱形图
Claims_bp <- boxplot(Insurance$Claims,
                     main = "Distribution of Claims")
Claims_bp$stats
Claims_bp$out
class(Claims_bp$out)
## 标柱重要标记点
points(x = 1,y = mean(Insurance$Claims), pch = 8)
# 获取异常值的取值
#Claims_points <- as.matrix(Insurance$Claims[which(Insurance$Claims > 102)],6,1)
Claims_points <- as.matrix(Claims_bp$out)
# 汇总12个重要点
Claims_text <- rbind(Claims_bp$stats,mean(Insurance$Claims),Claims_points)
# 标柱
for(i in 1:length(Claims_text)) {
  text(x = 1.1,y = Claims_text[i,], labels = Claims_text[i,])
}
# 各年龄段中要求索赔人数的分布
# 按Age 排序。
attach(Insurance)
data_plot <- rbind( 
  data.frame(var1 = Claims[Age == "<25"],var2 = "<25"),
  data.frame(var1 = Claims[Age == "25-29"],var2 = "25-29"),
  data.frame(var1 = Claims[Age == "30-35"],var2 = "30-35"),
  data.frame(var1 = Claims[Age == ">35"],var2 = ">35")
  )
detach(Insurance)
data_plot
boxplot(var1~var2,data_plot,horizontal = T, # 横向输出图形
        main = "Distribution of Claims by Age",xlab = "Claims",ylab = "Age"
        )
# 年龄大的更容易索赔？
# 结合投保人来看
attach(Insurance)
Holders_bp <- boxplot(Holders~Age,boxwex = 0.25, # 箱子大小
              at = 1:4+0.2 # 箱子的位置
              )
Claims_bp2 <- boxplot(var1~var2,data_plot,add = T, # 图形叠加
              boxwex = 0.25,at = 1:4 - 0.2,
              col = "lightgrey", # 浅灰色显示
              main = "Distribution of Claims by Age",
              xlab = "Age",ylab = "Claims&Holders"
              )
abline(h = Holders_bp$stats[2,4],lty = 2) # 下四分位 虚线
abline(h = Claims_bp2$stats[2,4],lty = 2)
# 图例
legend(x = "topleft",c("Claims","Holders"),fill = c("lightgrey","white"))
detach(Insurance)

# 条形图
# 按年龄分组求索赔变量之和
Claims_Age <- tapply(Insurance$Claims,Insurance$Age,sum)
barplot(Claims_Age,names.arg = levels(Insurance$Age),
        density = 20,
        main = "Distribution of Age by Claims",xlab = "Age",ylab = "Claims")
# 加入投保人变量
Holders_Age <- tapply(Insurance$Holders,Insurance$Age,sum)
bar_age <- rbind(Claims_Age,Holders_Age)
bar_age
barplot(bar_age,names.arg = levels(Insurance$Age),
        beside = T, # 分组条形图 ,默认FALSE是堆叠条形图
        main = "Age Distribution by Claims and Holders",
        xlab = "Age",ylab = "Claims&Holders",col = c("black","darkgrey")
        )
legend(x = "topleft",rownames(bar_age),fill = c("black","darkgrey"))
# 点阵图
dotchart(bar_age,xlab = "Claims&Holders",pch = 1:2,
         main = "Age Distribution by Claims and Holders")
# 饼图
pie(Claims_Age,labels = levels(Insurance$Age),
    main = "Pie chart of Age by Claims",
    col = c("white","lightgray","darkgrey","black"))
# 增加百分比
percent <- round(Claims_Age/sum(Claims_Age)*100)
labels <- paste(levels(Insurance$Age),":",percent,"%",sep = "")
labels
pie(Claims_Age,labels = labels,
    main = "Pie chart of Age by Claims",
    col = c("white","lightgray","darkgrey","black"))

## 数据预处理
library(mice)
data()
data(nhanes2)
summary(nhanes2)
str(nhanes2)
head(nhanes2)
# 数据清洗
# 缺失值
sum(is.na(nhanes2))
# 完整样本的数量
sum(complete.cases(nhanes2))
# 观测数据缺失状况
md.pattern(nhanes2)
# 删除法 、
# 均值插补、回归插补、二阶插补、热平台、冷平台、抽样填补等单一变量插补、
# 1、从完整样本中随机抽取等量样本代替缺失样本
sub <- which(is.na(nhanes2[,4]))
dataTR <- nhanes2[-sub,]
dataTE <- nhanes2[sub,]
dataTE[,4] <- sample(dataTR[,4],length(sub),replace = T)
dataTE
# 2、均值插补
dataTE <- nhanes2[sub,]
dataTE[,4] <- mean(dataTR[,4])
dataTE
# 3、回归预测缺失值
dataTE <- nhanes2[sub,]
lm <-lm(chl~age,data = dataTR)
dataTE[,4] <- round(predict(lm,dataTE))
dataTE
# 4、热平台（相似样本代替）
accept <- nhanes2[which(apply(is.na(nhanes2),1,sum) != 0),]  #存在缺失样本
donate <- nhanes2[which(apply(is.na(nhanes2),1,sum) == 0),]  #无缺失样本
accept[2,]
accept[2,2] <- donate[which(donate$age == accept[2,1] 
                           & donate$hyp == accept[2,3]
                           & donate$chl == accept[2,4]),2]
# 5、冷平台（分层，层内均值插补）
level1 <- nhanes2[which(nhanes2$hyp == "yes"),]
level1
level1[which(is.na(level1$chl)),4] <- mean(level1[which(!is.na(level1$chl)),4])

#多重插补法
imp <- mice(nhanes2,4)  # 生成4组完整的数据库
fit <- with(imp,lm(chl~age+hyp+bmi))
pooled <- pool(fit) # 汇总4个模型
summary(pooled)

# 噪声数据处理 （箱形图、点阵图）g
library(outliers)  # 与其他观测值及均值差距最大的点作为异常值
set.seed(1)  # 设置随机数种子，保证每次出现的随机数相同
y <- rnorm(100)  # 100个标准正态随机数
y
outlier(y)
outlier(y,opposite = T)  # 离群最远值的相反值
dotchart(y)
dim(y) <- c(20,5)
outlier(y) # 每列的离群最远值
outlier(y,logical = T)
boxplot(y)

# 数据不一致(人工监测、vapply批量监测)
x <- list(a = 1:10,beta = exp(-3:3),logic = c(T,F,F,F))
x 
probs <- c(1:3/4)
return_value <- c(0,0,0)
vapply(x,quantile,FUN.VALUE = return_value,probs = probs)
probs <- c(1:4/4)
return_value <- c(0,0,0,"")

# 数据集成遇到的问题。
# 1 数据冗余
x <- cbind(sample(c(1:50),10),sample(c(1:50),10))
chisq.test(x) # 拒绝原假设（相关）
cor(x)
cov(x)
# 2 观测值重复
x <- cbind(sample(c(1:10),10,replace = T),rnorm(10),rnorm(10))
y <- unique(x[,1])
y
sub <- rep(0,length(y))
for(i in 1:length(y)) {
  sub[i] <- which(x[,1] == y[i])
}
sub
x[sub,]

# 数据变换
# 1、光滑 2、属性构造（ROI）3、聚集
# 4、规范化
set.seed(1)
a <- rnorm(5)
b <- scale(a)
b
# 5、离散化 6、概念分层（上聚）

# 数据归约（压缩数据量）AIC准则；LASSO；信息熵；主成分分析
# 维归约 LASSO
library(glmnet)
x <- matrix(rnorm(100*20),100,20)  # 自变量
y <- rnorm(100)  # 因变量
fit1 <- glmnet(x,y) # 广义线性回归，自变量未分组的，默认为LASSO
b <- coef(fit1,s = 0.05)  #s减小，约束放宽，筛选的变量越多
b  # 有值的被选入模型
predict(fit1,newx = x[1:10,], s = c(0.01,0.005))

# 保存工作空间
save(list = ls(),file = "preprocessing.RData")
setwd("/Users/sun/dianping/learning material/DMLearningTeam/Part2/")
load("preprocessing.RData")
你看起来很好吃
关注
2
点赞
踩
6

收藏

觉得还不错? 一键收藏
0
评论
R基本数据操作

library(MASS)# 查看可用数据集data()# 获取数据集data(Insurance)# 数据集的介绍?Insurancesummary(Insurance)# 维度dim(Insurance)# 变量名names(Insurance)# top\tail recordshead(Insurance)tail(Insurance)# 类型class(I
复制链接

扫一扫