# 注:R语言中 <- 和 = 有一定区别:作为独立语句时,二者都在当前环境中赋值;但在函数调用的参数位置,= 只表示给参数传值(不创建变量),而 <- 会在调用环境中真正创建变量。修改外层(全局)变量需要用 <<-。
# Read the data set into `cars2`; string columns are converted to factors.
# (Author's note: readers who want to reproduce this should contact the author
# for the data file.)
cars2 <- read.csv(file = "../cars2.txt", stringsAsFactors = TRUE)
# Histogram of car weight (cars2), drawn on a single-panel layout.
par(mfrow = c(1, 1))
hist(
  cars2$weight,
  breaks = 30,
  xlim = c(0, 5000),
  ylim = c(0, 40),
  col = "blue",
  border = "black",
  xlab = "Weight",
  ylab = "Counts",
  main = "Histogram of Car Weights"
)
# Draw a solid black frame around the plot region.
box(which = "plot", lty = "solid", col = "black")
# Scatter plot of MPG against weight (cars2).
# NOTE(review): ylim runs up to 600, presumably so that extreme mpg values in
# cars2 stay visible -- confirm against the data.
plot(
  cars2$weight,
  cars2$mpg,
  xlim = c(0, 5000),
  ylim = c(0, 600),
  xlab = "Weight",
  ylab = "MPG",
  main = "scatterplot of MPG by Weight",
  type = "p",
  pch = 16,
  col = "blue"
)
# Overlay black open circles on the filled blue points to outline them.
points(cars2$weight, cars2$mpg, type = "p", col = "black")
# Summary statistics of car weight (the values a box plot is built from).
# NOTE(review): the visible script only reads `cars2`; `cars` is never
# assigned, and the built-in datasets::cars has no `weight` column, so
# `cars$weight` would silently be NULL. Presumably `cars` was loaded from a
# companion file in the author's session -- confirm and add the missing read.
# Fail fast here instead of summarising NULL.
stopifnot(is.numeric(cars$weight))

summary(cars$weight) # Min, Q1, Median, Mean, Q3, Max
# Output recorded by the author:
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#    1613    2246    2835    3005    3664    4997
mean(cars$weight)
median(cars$weight)
length(cars$weight)
sd(cars$weight)
# Rescaling car weight three ways: min-max normalization, Z-score
# standardization, and decimal scaling.

# Min-max normalization: the smallest weight maps to 0, the largest to 1.
mi <- min(cars$weight)
ma <- max(cars$weight)
minmax.weight <- (cars$weight - mi) / (ma - mi)
minmax.weight

# Z-score standardization: subtract the mean, divide by the sample sd.
m <- mean(cars$weight)
s <- sd(cars$weight)
z.weight <- (cars$weight - m) / s
z.weight

# Decimal scaling: divide by a fixed power of ten. 10^4 is hard-coded here;
# the largest weight in the author's recorded summary (4997) has four digits.
d.weight <- cars$weight / (10^4)
# Side-by-side histograms: raw weight on the left, its Z-score on the right.
# Uses `z.weight`, which must already exist when this section runs.
par(mfrow = c(1, 2))

# Left panel: raw weights.
hist(
  cars$weight,
  main = "Histogram of Weight",
  xlab = "Weight",
  ylab = "Counts",
  xlim = c(1000, 5000),
  breaks = 20
)
box(col = "black", lty = "solid", which = "plot")

# Right panel: standardized weights.
hist(
  z.weight,
  main = "Histogram Z-score of Weight",
  xlab = "Z-score of Weight",
  ylab = "Counts",
  xlim = c(-2, 3),
  breaks = 20
)
box(col = "black", lty = "solid", which = "plot")
# Skewness of car weight, and transformations intended to reduce it.

# Skewness measured as 3 * (mean - median) / sd (Pearson's second skewness
# coefficient). Positive values indicate right skew.
pearson_skew <- function(x) {
  3 * (mean(x) - median(x)) / sd(x)
}

pearson_skew(cars$weight)
# Author's recorded result: 0.5998638 > 0, i.e. right-skewed (positive skew).

# Candidate transformations toward normality: square root, natural log,
# inverse square root. Each transformed vector and its skewness are kept
# under the original variable names.
sqrt.weight <- sqrt(cars$weight)
sqrt.weight_skew <- pearson_skew(sqrt.weight)
ln.weight <- log(cars$weight)
ln.weight_skew <- pearson_skew(ln.weight)
invsqrt.weight <- 1 / sqrt(cars$weight)
invsqrt.weight_skew <- pearson_skew(invsqrt.weight)

# Show the skewness values side by side so the transformations can be compared
# (the original computed them but never displayed them).
c(
  original = pearson_skew(cars$weight),
  sqrt = sqrt.weight_skew,
  ln = ln.weight_skew,
  invsqrt = invsqrt.weight_skew
)
# Histogram of the inverse-square-root weights with a normal curve overlaid.
# Uses `invsqrt.weight`, which must already exist when this section runs.
par(mfrow = c(1, 1))

# Simulated normal sample with the same mean and sd as the data; its kernel
# density estimate serves as the reference curve. No seed is set, so the curve
# can differ very slightly from run to run.
x <- rnorm(1000000, mean = mean(invsqrt.weight), sd = sd(invsqrt.weight))

hist(
  invsqrt.weight,
  breaks = 30,
  xlim = c(0.0125, 0.0275),
  col = "lightblue",
  probability = TRUE, # density scale, so the density curve is comparable
  border = "black", # was misspelled `boder`, which hist() does not recognise
  xlab = "Inverse Square Root of Weight",
  ylab = "Density", # the axis shows densities, not counts
  main = "Histogram of Inverse Square Root of Weight"
)
box(which = "plot", lty = "solid", col = "black")
lines(density(x), col = "red")
# Normal Q-Q plot of the inverse-square-root weights, with a reference line.
# datax = TRUE is passed to both calls so the line matches the plot's
# orientation (data values on the x axis).
qqnorm(
  invsqrt.weight,
  main = "Normal Q—QPlot Inverse Square Root of Weight",
  ylim = c(0.01, 0.03),
  col = "red",
  datax = TRUE
)
qqline(invsqrt.weight, datax = TRUE, col = "blue")
# Undoing the transformation: take the first weight, apply the inverse square
# root, then invert it again (1 / y^2) to recover the original value.
x <- cars$weight[1]
y <- 1 / sqrt(x)
de_x <- 1 / y^2
x
y
de_x
# Console output recorded by the author:
# > x
# [1] 4209
# > y
# [1] 0.01541383
# > de_x
# [1] 4209
# Show the rows of `cars` sorted by mpg in ascending order.
cars[order(cars$mpg, decreasing = FALSE), ]

# Check for duplicate records: one logical per row, TRUE where the row repeats
# an earlier row.
duplicated(cars)

# Index of the first duplicated row, or 0 when there are no duplicates.
anyDuplicated(cars)