R语言手册(第二站 数据预处理)
标签: R语言
1.读入数据集Cars和Cars2
# Read the two car data sets; text columns stay character rather than factor.
# NOTE(review): "C:/./cars.txt" looks like a placeholder path - point it at
# the real location of the files before running.
cars <- read.csv("C:/./cars.txt", stringsAsFactors = FALSE)
cars2 <- read.csv("C:/./cars2.txt", stringsAsFactors = FALSE)
2.缺失数据
# Look at four variables of the cars data set (columns 1, 3, 4 and 8)
cars.4var <- cars[, c(1, 3, 4, 8)]
head(cars.4var)
输出:
3.将某些条目设为缺失值
# Blank out two cells (row 4 / column 4 and row 2 / column 2) so that the
# following steps have missing values to work on
cars.4var[4, 4] <- NA
cars.4var[2, 2] <- NA
head(cars.4var)
输出:
4.使用常量替换缺失值
# Replace the missing text cell with the label "Missing" and the missing
# numeric cell with the constant 0
cars.4var[4, 4] <- "Missing"
cars.4var[2, 2] <- 0
head(cars.4var)
输出:
5.使用均值和众数替换缺失值
# Replace the numeric cell with the mean of the non-missing cubicinches values
cars.4var[2, 2] <- mean(cars.4var$cubicinches, na.rm = TRUE)
# Replace the categorical cell with the mode (most frequent value) of brand
our_table <- table(cars.4var$brand)
# `==` (comparison) builds the logical mask; `=` here would be an assignment
our_mode <- names(our_table)[our_table == max(our_table)]
# NOTE(review): if several brands tie for the maximum count, our_mode has more
# than one element and the single-cell assignment below will fail.
cars.4var[4, 4] <- our_mode
head(cars.4var)
输出:
6.生成随机观测值
# Draw one random value from the non-missing observations of each variable
obs_brand <- sample(na.omit(cars.4var$brand), 1)
obs_cubicinches <- sample(na.omit(cars.4var$cubicinches), 1)
# Use the drawn values as replacements for the two cells
cars.4var[2, 2] <- obs_cubicinches
cars.4var[4, 4] <- obs_brand
head(cars.4var)
输出:
7.创建直方图
# Use a single plotting panel
par(mfrow = c(1, 1))
# Draw the histogram of car weights
hist(cars2$weight,
     breaks = 30,
     xlim = c(0, 5000),
     col = "blue",
     border = "black",
     ylim = c(0, 40),
     xlab = "Weight",
     ylab = "Counts",
     main = "Histogram of Car Weights")
# Draw a frame around the plot region
box(which = "plot", lty = "solid", col = "black")
输出:
8.创建散点图
# Scatterplot of MPG against weight, drawn with filled blue dots
plot(cars2$weight, cars2$mpg,
     xlim = c(0, 5000),
     ylim = c(0, 600),
     xlab = "Weight",
     ylab = "MPG",
     main = "Scatterplot of MPG by Weight",
     type = "p",
     pch = 16,
     col = "blue")
# Overlay open black circles on the same points
points(cars2$weight, cars2$mpg, type = "p", col = "black")
9.统计描述
mean(cars$weight)    # mean
median(cars$weight)  # median
length(cars$weight)  # number of observations
sd(cars$weight)      # standard deviation
summary(cars$weight) # min, Q1, median, mean, Q3, max
10.变换
# Min-max normalization: rescale weight onto [0, 1]
summary(cars$weight)
mi <- min(cars$weight)
ma <- max(cars$weight)
minmax.weight <- (cars$weight - mi) / (ma - mi)
minmax.weight
# Z-score standardization: centre on the mean, scale by the standard deviation
m <- mean(cars$weight)
s <- sd(cars$weight)
z.weight <- (cars$weight - m) / s
z.weight
length(cars$weight)
# Decimal scaling: divide by 10^d, where d is the digit count of the largest
# absolute value
max(abs(cars$weight)) # four digits
d.weight <- cars$weight / (10^4)
d.weight
11.并排直方图
# Split the device into one row and two columns of panels
par(mfrow = c(1, 2))
# Left panel: histogram of the raw weights
hist(cars$weight,
     breaks = 20,
     xlim = c(1000, 5000),
     main = "Histogram of Weight",
     xlab = "Weight",
     ylab = "Counts")
box(which = "plot", lty = "solid", col = "black")
# Right panel: histogram of the Z-score standardized weights
hist(z.weight,
     breaks = 20,
     xlim = c(-2, 3),
     main = "Histogram of Z-score of Weight",
     xlab = "Z-score of Weight",
     ylab = "Counts")
box(which = "plot", lty = "solid", col = "black")
输出:
12.偏度
# Skewness measured as 3 * (mean - median) / standard deviation,
# first for the raw weights, then for their Z-scores
(3 * (mean(cars$weight) - median(cars$weight))) / sd(cars$weight)
(3 * (mean(z.weight) - median(z.weight))) / sd(z.weight)
13.正态转换
# Each transformation is followed by its skewness,
# computed as 3 * (mean - median) / standard deviation.

# Square root
sqrt.weight <- sqrt(cars$weight)
sqrt.weight_skew <- (3 * (mean(sqrt.weight) - median(sqrt.weight))) /
  sd(sqrt.weight)
# Natural logarithm
ln.weight <- log(cars$weight)
ln.weight_skew <- (3 * (mean(ln.weight) - median(ln.weight))) /
  sd(ln.weight)
# Inverse square root
invsqrt.weight <- 1 / sqrt(cars$weight)
invsqrt.weight_skew <- (3 * (mean(invsqrt.weight) - median(invsqrt.weight))) /
  sd(invsqrt.weight)
14.正态分布直方图
# Back to a single plotting panel
par(mfrow = c(1, 1))
# Simulate a normal sample with the same mean and standard deviation as the
# transformed weights; its density is overlaid on the histogram below
x <- rnorm(1000000,
           mean = mean(invsqrt.weight),
           sd = sd(invsqrt.weight))
# prob = TRUE puts the histogram on the density scale so the curve is
# comparable. NOTE(review): the y axis label still reads "Counts".
hist(invsqrt.weight,
     breaks = 30,
     xlim = c(0.0125, 0.0275),
     col = "lightblue",
     prob = TRUE,
     border = "black",
     xlab = "Inverse Square Root of Weight",
     ylab = "Counts",
     main = "Histogram of Inverse Square Root of Weight")
box(which = "plot", lty = "solid", col = "black")
# Overlay the normal density curve
lines(density(x), col = "red")
输出:
15.正态Q-Q图
# Normal Q-Q plot of the transformed weights, with the data on the x axis
qqnorm(invsqrt.weight,
       datax = TRUE,
       col = "red",
       ylim = c(0.01, 0.03),
       main = "Normal Q-Q Plot of Inverse Square Root of Weight")
# Reference line; datax must match the qqnorm() call above
qqline(invsqrt.weight, col = "blue", datax = TRUE)
输出:
16.数据逆变换
# Forward transformation: y = 1 / sqrt(x), applied to the first weight
x <- cars$weight[1]
y <- 1 / sqrt(x)
# Inverse transformation: x = 1 / y^2 recovers the original value
detransformedx <- 1 / y^2
x
y
detransformedx
输出:
17.创建指示变量
# Example region variable with ten observations
region <- c(rep(c("north", "south", "east", "west"), 2), "north", "south")
# Turn the region variable into 0/1 indicator variables. Only three flags are
# built for the four regions: "west" is the case where all three are 0.
# The comparison is vectorized, so no explicit loop is needed.
north_flag <- as.numeric(region == "north")
east_flag <- as.numeric(region == "east")
south_flag <- as.numeric(region == "south")
north_flag
east_flag
south_flag
输出:
18.索引字段
# A data frame has an index field: the left-most column shown when the
# cars data set is printed. Sorting the rows by mpg keeps it attached.
cars[order(cars$mpg), ]
# For a vector or matrix, add a column to act as the index field
x <- c(1, 1, 3:1, 1:4, 3)
y <- c(9, 9:1)
z <- c(2, 1:9)
# Bind the vectors as rows, then transpose so each becomes a column
matrix <- t(rbind(x, y, z))
matrix
# Prepend the row position as the index column
indexed_m <- cbind(seq_along(x), matrix)
indexed_m
# After sorting by z, the index column still shows each row's original position
indexed_m[order(z), ]
19.重复记录
# anyDuplicated() gives the index of the first duplicated record (0 if none)
anyDuplicated(cars)

# duplicated() flags every record:
#   TRUE  - the record repeats an earlier one
#   FALSE - the record is not a duplicate
duplicated(cars)

# Append a copy of the first record to create a known duplicate
new.cars <- rbind(cars, cars[1, ])

# Check again: the original note expects record 262 (the appended copy)
# to be reported as the duplicate
anyDuplicated(new.cars)
duplicated(new.cars)