R语言手册（第二站数据预处理）

最新推荐文章于 2024-07-12 20:34:48 发布

纸境止境

最新推荐文章于 2024-07-12 20:34:48 发布

阅读量1.4k

点赞数 1

分类专栏：多元统计分析、R语言　文章标签：R语言、数据分析、数据挖掘

本文链接：https://blog.csdn.net/qq_40891541/article/details/89490678

版权

多元统计分析同时被 2 个专栏收录

19 篇文章 1 订阅

订阅专栏

R语言

9 篇文章 0 订阅

订阅专栏

R语言手册（第二站数据预处理）

标签： R语言

1.读入数据集Cars和Cars2

# Read the two car data sets into data frames. stringsAsFactors = FALSE keeps
# text columns as character vectors instead of converting them to factors.
cars <- read.csv("C:/./cars.txt", stringsAsFactors = FALSE)
cars2 <- read.csv("C:/./cars2.txt", stringsAsFactors = FALSE)

2.缺失数据

# Look at four variables of the cars data set: columns 1, 3, 4 and 8.
cars.4var <- cars[, c(1, 3, 4, 8)]
head(cars.4var)

3.将某些条目设置为缺失值

# Overwrite two cells (row 2 / column 2 and row 4 / column 4) with NA so the
# following sections have missing values to work on.
cars.4var[2, 2] <- cars.4var[4, 4] <- NA
head(cars.4var)

4.使用常量替换缺失值

# Replace the two missing cells with constants: 0 for the cell in column 2 and
# the label "Missing" for the cell in column 4.
cars.4var[2, 2] <- 0
cars.4var[4, 4] <- "Missing"
head(cars.4var)

5.使用均值和众数替换缺失值

# Replace the missing cubicinches value with the mean of the observed values.
cars.4var[2, 2] <- mean(cars.4var$cubicinches, na.rm = TRUE)

# Replace the missing brand with the mode (most frequent value). table() drops
# NA by default; which.max() returns a single position, so a tie between
# brands yields exactly one replacement value (the first in table order).
our_table <- table(cars.4var$brand)
our_mode <- names(our_table)[which.max(our_table)]
cars.4var[4, 4] <- our_mode
head(cars.4var)

6.生成随机观测值

# Draw one observed (non-missing) value at random from each variable and use
# it to fill the corresponding cell.
obs_brand <- sample(na.omit(cars.4var$brand), 1)
obs_cubicinches <- sample(na.omit(cars.4var$cubicinches), 1)
cars.4var[2, 2] <- obs_cubicinches
cars.4var[4, 4] <- obs_brand
head(cars.4var)

7.创建直方图

# Set up the plot area: a single panel.
par(mfrow = c(1, 1))

# Histogram of car weights from cars2.
hist(
  cars2$weight,
  breaks = 30,
  xlim = c(0, 5000),
  col = "blue",
  border = "black",
  ylim = c(0, 40),
  xlab = "Weight",
  ylab = "Counts",
  main = "Histogram of Car Weights"
)

# Draw a box around the plot region.
box(which = "plot", lty = "solid", col = "black")

8.创建散点图

# Scatterplot of MPG against weight, drawn with filled blue points (pch = 16).
plot(
  cars2$weight,
  cars2$mpg,
  xlim = c(0, 5000),
  ylim = c(0, 600),
  xlab = "Weight",
  ylab = "MPG",
  main = "Scatterplot of MPG by Weight",
  type = "p",
  pch = 16,
  col = "blue"
)

# Overlay open black circles on the same points.
points(cars2$weight, cars2$mpg, type = "p", col = "black")

9.统计描述

# Descriptive statistics for cars$weight.
mean(cars$weight)     # mean
median(cars$weight)   # median
length(cars$weight)   # number of observations
sd(cars$weight)       # standard deviation
summary(cars$weight)  # min, Q1, median, mean, Q3, max

10.变换

# Min-max normalisation: (x - min) / (max - min).
summary(cars$weight)
mi <- min(cars$weight)
ma <- max(cars$weight)
minmax.weight <- (cars$weight - mi) / (ma - mi)
minmax.weight

# Z-score standardisation: (x - mean) / standard deviation.
m <- mean(cars$weight)
s <- sd(cars$weight)
z.weight <- (cars$weight - m) / s
z.weight
length(cars$weight)

# Decimal scaling: divide by 10^d, where d is the number of digits of the
# largest absolute value.
max(abs(cars$weight))  # a four-digit number, hence d = 4
d.weight <- cars$weight / (10^4)
d.weight

11.并排直方图

# Split the plot area into one row and two columns.
par(mfrow = c(1, 2))

# Left panel: histogram of the raw weights.
hist(
  cars$weight,
  breaks = 20,
  xlim = c(1000, 5000),
  main = "Histogram of Weight",
  xlab = "Weight",
  ylab = "Counts"
)
box(which = "plot", lty = "solid", col = "black")

# Right panel: histogram of the z-scored weights (z.weight is computed in the
# Z-score section of this script).
hist(
  z.weight,
  breaks = 20,
  xlim = c(-2, 3),
  main = "Histogram of Z-score of Weight",
  xlab = "Z-score of Weight",
  ylab = "Counts"
)
box(which = "plot", lty = "solid", col = "black")

12.偏度

# Skewness measured as 3 * (mean - median) / standard deviation, computed for
# the raw weights and for the z-scored weights.
(3 * (mean(cars$weight) - median(cars$weight))) / sd(cars$weight)
(3 * (mean(z.weight) - median(z.weight))) / sd(z.weight)

13.正态转换

# Three candidate transformations of weight, each followed by its skewness
# computed as 3 * (mean - median) / standard deviation.

# Square root
sqrt.weight <- sqrt(cars$weight)
sqrt.weight_skew <-
  (3 * (mean(sqrt.weight) - median(sqrt.weight))) / sd(sqrt.weight)

# Natural logarithm
ln.weight <- log(cars$weight)
ln.weight_skew <-
  (3 * (mean(ln.weight) - median(ln.weight))) / sd(ln.weight)

# Inverse square root
invsqrt.weight <- 1 / sqrt(cars$weight)
invsqrt.weight_skew <-
  (3 * (mean(invsqrt.weight) - median(invsqrt.weight))) / sd(invsqrt.weight)

14.正态分布直方图

# Back to a single-panel plot area.
par(mfrow = c(1, 1))

# Simulate a large normal sample with the same mean and standard deviation as
# the inverse-square-root weights; its density is overlaid further down.
x <- rnorm(
  1000000,
  mean = mean(invsqrt.weight),
  sd = sd(invsqrt.weight)
)

# prob = TRUE puts the histogram on the density scale so that it is comparable
# with the density curve added by lines().
hist(
  invsqrt.weight,
  breaks = 30,
  xlim = c(0.0125, 0.0275),
  col = "lightblue",
  prob = TRUE,
  border = "black",
  xlab = "Inverse Square Root of Weight",
  ylab = "Counts",
  main = "Histogram of Inverse Square Root of Weight"
)
box(which = "plot", lty = "solid", col = "black")

# Overlay the density of the simulated normal sample.
lines(density(x), col = "red")

15.正态Q-Q图

# Normal Q-Q plot of the inverse-square-root weights. datax = TRUE puts the
# data on the x axis; qqline() must use the same setting to match.
qqnorm(
  invsqrt.weight,
  datax = TRUE,
  col = "red",
  ylim = c(0.01, 0.03),
  main = "Normal Q-Q Plot of Inverse Square Root of Weight"
)

qqline(invsqrt.weight, col = "blue", datax = TRUE)

16.数据逆变换

# Transform x with y = 1 / sqrt(x), using the first weight as the example.
x <- cars$weight[1]
y <- 1 / sqrt(x)

# Undo the transformation with x = 1 / y^2.
detransformedx <- 1 / y^2

# Print all three; detransformedx should reproduce x up to rounding error.
x
y
detransformedx

17.创建指示变量

# Build 0/1 indicator (dummy) variables for a four-level region variable.
# Only three flags are needed: "west" is the level for which all three are 0.
region <- c(rep(c("north", "south", "east", "west"), 2), "north", "south")

# Convert the region variable to indicators. Comparing the whole vector at
# once replaces an element-by-element loop; as.numeric() maps TRUE/FALSE
# to 1/0.
north_flag <- as.numeric(region == "north")
east_flag <- as.numeric(region == "east")
south_flag <- as.numeric(region == "south")

north_flag
east_flag
south_flag

18.索引字段

# A data frame already carries an index field: the row names shown in the
# left-most column of the printed output. Sorting the rows by mpg keeps each
# row's original row name, so the original position can still be read off.
cars[order(cars$mpg), ]

# For vectors or matrices, add a column to act as the index field.
x <- c(1, 1, 3:1, 1:4, 3)
y <- c(9, 9:1)
z <- c(2, 1:9)

# One row per observation, one column per variable.
# NOTE: the variable name `matrix` masks base::matrix for plain lookups; calls
# such as matrix(...) still resolve to the function.
matrix <- t(rbind(x, y, z))
matrix

# Prepend the original row position as the first column.
indexed_m <- cbind(seq_along(x), matrix)
indexed_m

# Sort by z; the first column still shows where each row came from.
indexed_m[order(z), ]

19.重复记录

# anyDuplicated() returns the index of the first duplicated record, or 0 when
# there are no duplicates (it is not a count of duplicates).
anyDuplicated(cars)

# duplicated() returns one logical value per record:
#   TRUE  - the record repeats an earlier record
#   FALSE - the record is not a repeat
duplicated(cars)

# Append a copy of the first record to create a known duplicate.
new.cars <- rbind(cars, cars[1, ])

# Check for duplicates: anyDuplicated() reports the index of the first
# repeated record, which is the appended last row if cars itself has no
# duplicates (record 262 according to the original article - confirm against
# nrow(cars)).
anyDuplicated(new.cars)
duplicated(new.cars)