R语言手册(第二站 数据预处理)

R语言手册(第二站 数据预处理)

标签: R语言

1.读入数据集Cars和Cars2
# Load the two data sets used throughout this walkthrough.
# stringsAsFactors = FALSE keeps text columns as character vectors.
# The assignment operator must be `<-`; a bare `<` only compares and assigns nothing.
cars <- read.csv("C:/./cars.txt", stringsAsFactors = FALSE)
cars2 <- read.csv("C:/./cars2.txt", stringsAsFactors = FALSE)
2.缺失数据
# Look at four variables of the cars data set (columns 1, 3, 4 and 8).
# An empty row index keeps every row.
cars.4var <- cars[, c(1, 3, 4, 8)]
head(cars.4var)

输出:image_1d9251pgb65j1jb41phg11se16659.png-26.6kB

3.将某些条目设为缺失值
# Blank out two cells so the following steps have missing values to work on:
# first row 4 / column 4, then row 2 / column 2.
cars.4var[4, 4] <- NA
cars.4var[2, 2] <- NA
head(cars.4var)

输出:image_1d9258bff1huaui34e1124hm9f26.png-26.3kB

4.使用常量替换缺失值
# Replace the missing entries with constants, addressing each cell as
# "element i of column j": 0 for row 2 of column 2, and the text
# "Missing" for row 4 of column 4.
cars.4var[[2]][2] <- 0
cars.4var[[4]][4] <- "Missing"
head(cars.4var)

输出:image_1d925esi614d5f8f1h7h1n671h0t2j.png-25.7kB

5.使用均值和众数替换缺失值
# Replace the missing numeric entry with the mean of the observed values.
# NOTE(review): only NA is skipped here; if the cell still holds a placeholder
# constant it will be included in the mean -- reset it to NA first if needed.
cars.4var[2, 2] <- mean(cars.4var$cubicinches, na.rm = TRUE)

# Replace the missing categorical entry with the mode (most frequent value).
# table() drops NA by default, so only observed brands are counted.
our_table <- table(cars.4var$brand)
# `==` (comparison) is required here; `=` would be parsed as a named argument.
# If several brands tie for the maximum, keep the first so that exactly one
# value is written into the single cell.
our_mode <- names(our_table)[our_table == max(our_table)][1]
cars.4var[4, 4] <- our_mode
head(cars.4var)

输出:image_1d925hd5p1h5q1dbe11eg1ltn9f030.png-29kB

6.生成随机观测值
# Replace the missing entries with a value drawn at random from the observed
# (non-missing) values of the same column. Results change between runs unless
# set.seed() is called beforehand.
obs_brand <- sample(na.omit(cars.4var$brand), 1)
obs_cubicinches <- sample(na.omit(cars.4var$cubicinches), 1)
# One statement per line: several statements on one line need `;` separators.
cars.4var[2, 2] <- obs_cubicinches
cars.4var[4, 4] <- obs_brand
head(cars.4var)

输出:image_1d925ksul1q0d1ffs44d1nmq1a563d.png-29.1kB

7.创建直方图
# Set up the plotting area: a single panel
par(mfrow = c(1, 1))
# Draw the histogram of car weights from cars2
hist(cars2$weight,
    breaks = 30,
    xlim = c(0, 5000),   # closing parenthesis was missing in the original
    col = "blue",
    border = "black",
    ylim = c(0, 40),
    xlab = "Weight",
    ylab = "Counts",
    main = "Histogram of Car Weights")
# Draw a frame around the plot region
box(which = "plot", lty = "solid", col = "black")

输出:image_1d925nb4b8461ndifes1uc51r4n3q.png-31.9kB

8.创建散点图
# Scatterplot of MPG against weight, drawn with solid blue dots (pch = 16)
plot(cars2$weight, cars2$mpg,
    xlim = c(0, 5000),
    ylim = c(0, 600),
    xlab = "Weight",
    ylab = "MPG",
    main = "Scatterplot of MPG by Weight",
    type = "p",
    pch = 16,
    col = "blue")
# Overlay open black circles (the default point symbol) on the same points
points(cars2$weight, cars2$mpg, type = "p", col = "black")
9.统计描述
# Descriptive statistics of the weight column (accessed with `$`)
mean(cars$weight)       # mean
median(cars$weight)     # median
length(cars$weight)     # number of observations
sd(cars$weight)         # standard deviation
summary(cars$weight)    # min, Q1, median, mean, Q3, max
10.变换
# Min-max normalization: (x - min) / (max - min) rescales weight to [0, 1]
summary(cars$weight)
mi <- min(cars$weight)
ma <- max(cars$weight)
minmax.weight <- (cars$weight - mi) / (ma - mi)
minmax.weight

# Z-score standardization: (x - mean) / sd
m <- mean(cars$weight)
s <- sd(cars$weight)
z.weight <- (cars$weight - m) / s
z.weight
length(cars$weight)

# Decimal scaling: divide by 10^d, where d is the number of digits of the
# largest absolute value (the original note says four digits for this data,
# hence 10^4 -- adjust the exponent for other data).
max(abs(cars$weight))
d.weight <- cars$weight / (10^4)
d.weight
11.并排直方图
# Split the plotting area into one row and two columns
par(mfrow = c(1, 2))

# Left panel: histogram of the raw weights
hist(cars$weight,
    breaks = 20,
    xlim = c(1000, 5000),
    main = "Histogram of Weight",
    xlab = "Weight",
    ylab = "Counts")
# The line-type parameter is `lty` (lower-case L)
box(which = "plot", lty = "solid", col = "black")

# Right panel: histogram of the Z-score standardized weights (z.weight)
hist(z.weight,
    breaks = 20,
    xlim = c(-2, 3),
    main = "Histogram of Z-score of Weight",
    xlab = "Z-score of Weight",
    ylab = "Counts")
box(which = "plot", lty = "solid", col = "black")

输出:image_1d926beg8sd2g5i15tvvuu1jp247.png-76.2kB

12.偏度
# Skewness estimated as 3 * (mean - median) / standard deviation.
# Computed for the raw weights and for their Z-scores (z.weight).
(3 * (mean(cars$weight) - median(cars$weight))) / sd(cars$weight)
(3 * (mean(z.weight) - median(z.weight))) / sd(z.weight)
13.正态转换
# Transformations intended to reduce skewness; each is followed by the
# skewness estimate 3 * (mean - median) / sd of the transformed values.

# Square root
sqrt.weight <- sqrt(cars$weight)
sqrt.weight_skew <- (3 * (mean(sqrt.weight) - median(sqrt.weight))) / sd(sqrt.weight)

# Natural logarithm (log() is base e by default)
ln.weight <- log(cars$weight)
ln.weight_skew <- (3 * (mean(ln.weight) - median(ln.weight))) / sd(ln.weight)

# Inverse square root
invsqrt.weight <- 1 / sqrt(cars$weight)
invsqrt.weight_skew <- (3 * (mean(invsqrt.weight) - median(invsqrt.weight))) / sd(invsqrt.weight)

14.正态分布直方图

# Back to a single plotting panel
par(mfrow = c(1, 1))

# Simulate a large normal sample with the same mean and standard deviation as
# the inverse-square-root weights. rnorm() takes n, mean and sd only; the
# stray extra positional argument in the original raised an error.
x <- rnorm(1000000,
        mean = mean(invsqrt.weight),
        sd = sd(invsqrt.weight))

# prob = TRUE puts the histogram on the density scale so that the density
# curve added below is drawn on a comparable y-axis.
hist(invsqrt.weight,
    breaks = 30,
    xlim = c(0.0125, 0.0275),
    col = "lightblue",
    prob = TRUE,
    border = "black",
    xlab = "Inverse Square Root of Weight",
    ylab = "Counts",
    main = "Histogram of Inverse Square Root of Weight")
box(which = "plot", lty = "solid", col = "black")

# Overlay the estimated density of the simulated normal sample
lines(density(x), col = "red")

输出:image_1d96gau8gbrn124ui71tddhj71p.png-40.7kB

15.正态Q-Q图

# Normal Q-Q plot of the inverse-square-root weights.
# datax = TRUE puts the data on the x-axis.
qqnorm(invsqrt.weight,
        datax = TRUE,
        col = "red",
        ylim = c(0.01, 0.03),
        main = "Normal Q-Q Plot of Inverse Square Root of Weight")

# Reference line; datax must match the qqnorm() call above
qqline(invsqrt.weight, col = "blue", datax = TRUE)

输出:image_1d96gddl4i9d1nb819mg45npk726.png-31.9kB

16.数据逆变换

# Forward transform: take the first weight and apply y = 1 / sqrt(x)
x <- cars$weight[1]
y <- 1 / sqrt(x)
# Inverse transform: recover x from y with x = 1 / y^2
detransformedx <- 1 / y^2
# Show the original value, the transformed value and the recovered value
x
y
detransformedx

输出:image_1d96gigcd1urr1borm06011d752j.png-10.2kB

17.创建指示变量

# Region variable: the four regions repeated twice, then "north" and "south"
# (10 values in total).
region <- c(rep(c("north", "south", "east", "west"), 2), "north", "south")

# Convert the region variable into 0/1 indicator (flag) variables.
# The comparison is vectorized, so no explicit loop or pre-allocated NA
# vectors are needed. `==` tests equality; `=` would be an assignment.
# No flag is created for "west": it is the category where all three flags are 0.
north_flag <- as.numeric(region == "north")
east_flag <- as.numeric(region == "east")
south_flag <- as.numeric(region == "south")

north_flag
east_flag
south_flag

输出:image_1d96gt3op12ta1tkl7mv1i0mj3030.png-15.7kB

18.索引字段

# A data frame already carries an index field: its row labels, printed as the
# left-most column. Sorting the rows by mpg keeps each row's original label.
cars[order(cars$mpg), ]

# Vectors and matrices have no built-in index field,
# so add a column of row numbers to act as one.
x <- c(1, 1, 3:1, 1:4, 3)
y <- c(9, 9:1)
z <- c(2, 1:9)
# NOTE(review): the variable name `matrix` shadows base::matrix for variable
# lookups; it is kept unchanged from the original.
matrix <- t(rbind(x, y, z))
matrix
# Prepend the index column (1..length(x)) to the data columns
indexed_m <- cbind(seq_along(x), matrix)
indexed_m
# Reorder the rows by z; the first column still shows each row's original position
indexed_m[order(z), ]

19.重复记录

# anyDuplicated() returns the index of the first duplicated record,
# or 0 when there is none.
anyDuplicated(cars)
# duplicated() returns one logical value per record:
#   TRUE  - the record repeats an earlier one
#   FALSE - the record is not a repeat
duplicated(cars)

# Append a copy of the first record to create a known duplicate
new.cars <- rbind(cars, cars[1, ])
# Check again. Per the original note the duplicate is record 262, which
# presumes cars has 261 rows -- confirm against the loaded data.
anyDuplicated(new.cars)
duplicated(new.cars)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值