# Basic graphs
# Install vcd only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("vcd", quietly = TRUE)) {
  install.packages("vcd")
}
library("grid")
library("vcd")
# Bar plots
# Vertical or horizontal bars display the distribution (frequencies) of a
# categorical variable.
counts <- table(Arthritis$Improved)

# Simple vertical bar plot
barplot(
  counts,
  main = "Simple Bar Plot",
  xlab = "Improvement",
  ylab = "Frequency"
)

# Simple horizontal bar plot; the axis titles are swapped to match
barplot(
  counts,
  horiz = TRUE,
  main = "Simple Bar Plot",
  xlab = "Frequency",
  ylab = "Improvement"
)

# For a factor, plot() can draw the bar plot directly from the raw column.
# Expected to report "ordered" "factor", i.e. an ordered factor.
class(Arthritis$Improved)

# Simple vertical bar plot
plot(
  Arthritis$Improved,
  main = "Simple Bar Plot",
  xlab = "Improvement",
  ylab = "Frequency"
)

# Simple horizontal bar plot; the axis titles are swapped to match
plot(
  Arthritis$Improved,
  horiz = TRUE,
  main = "Simple Bar Plot",
  xlab = "Frequency",
  ylab = "Improvement"
)
# Stacked and grouped bar plots
# Data source: contingency table of Improved (rows) by Treatment (columns)
library("vcd")
counts <- table(Arthritis$Improved, Arthritis$Treatment)
counts

# One colour per row of counts, shared by the bars and by both legends so the
# two can never drift apart.
bar_cols <- c("red", "yellow", "green")

opar <- par(no.readonly = TRUE)
# mai = c(bottom, left, top, right) margin sizes, in inches
par(mai = c(0.5, 0.5, 0.5, 0.5))

# Stacked bar plot: one bar per column, split into one segment per row
barplot(
  counts,
  main = "Stacked Bar Plot",
  xlab = "Treatment", ylab = "Frequency",
  col = bar_cols
)
# Place the legend with a mouse click when possible. locator() yields NULL
# when no click is available (batch run, cancelled click), so fall back to a
# fixed corner instead of handing NULL to legend().
legend_pos <- if (interactive()) locator(1) else NULL
if (is.null(legend_pos)) {
  legend_pos <- "topright"
}
legend(
  legend_pos,
  legend = rownames(counts),
  col = bar_cols,
  pch = c(15),
  title = "Drug Improved",
  xpd = TRUE
)
par(opar)

# Grouped bar plot: the segments of each column are drawn side by side
barplot(
  counts,
  main = "Grouped Bar Plot",
  xlab = "Treatment", ylab = "Frequency",
  col = bar_cols,
  beside = TRUE
)
legend_pos <- if (interactive()) locator(1) else NULL
if (is.null(legend_pos)) {
  legend_pos <- "topright"
}
legend(
  legend_pos,
  legend = rownames(counts),
  col = bar_cols,
  pch = c(15),
  title = "Drug Improved"
)
# Bar plot of group means
# state.region: factor giving a region for each US state
# state.x77:    matrix of per-state indicators (includes Illiteracy)
# Combine both into one data frame, then average Illiteracy within each region.
states <- data.frame(state.region, state.x77)
means <- with(
  states,
  aggregate(Illiteracy, by = list(state.region), FUN = mean)
)
# Sort the regions by their mean (ascending) so the bars grow left to right
sorted_rows <- order(means$x)
means <- means[sorted_rows, ]
barplot(height = means$x, names.arg = means$Group.1)
title(main = "Mean Illiteracy Rate")
# Fine-tuning a bar plot
# Widen the left margin (mar = c(bottom, left, top, right), in lines) and
# rotate the axis labels (las = 2) for this plot only. par() returns the
# previous values, which are restored afterwards so the settings do not leak
# into every later plot in the script.
old_par <- par(mar = c(5, 8, 4, 2), las = 2)
counts <- table(Arthritis$Improved)
barplot(
  counts,
  main = "Treatment Outcome",
  horiz = TRUE,
  cex.names = 0.8,  # shrink the bar labels
  names.arg = c(
    "No Improvement", "Some Improvement",
    "Marked Improvement"
  )
)
par(old_par)
# Spinogram: every bar is drawn at the same height, and the segments inside
# it are coloured according to the data.
library(vcd)
# with() evaluates the bare column names inside Arthritis without attach(),
# so nothing is added to the search path, and the table dimensions still
# carry the names "Treatment" and "Improved".
counts <- with(Arthritis, table(Treatment, Improved))
spine(counts, main = "Spinogram Example")
# Pie charts (unlike the fan plot, the slices do not overlap)
# Lay the four charts out on one page as a 2x2 grid
par(mfrow = c(2, 2))

# Chart 1: simple pie chart from a numeric vector
slices <- c(10, 12, 4, 16, 8)  # size of each slice
lbls <- c("US", "UK", "Australia", "Germany", "France")
pie(x = slices, labels = lbls, main = "Simple Pie Chart")

# Chart 2: simple pie chart labelled with percentages
# Convert the raw values into whole-number percentages of the total
pct <- round(100 * slices / sum(slices))
# Custom labels such as "US 20%"
lbls2 <- paste0(lbls, " ", pct, "%")
# Alternative with a space between every piece, e.g. "US 20 %":
# lbls2 <- paste(lbls, pct, "%", sep = " ")
pie(x = pct, labels = lbls2, main = "Pie Chart with Percentages")

# Chart 3: 3D pie chart from the numeric vector
library(plotrix)
pie3D(
  slices,
  labels = lbls,
  explode = 0.1,
  main = "3D Pie Chart"
)

# Chart 4: pie chart from a table
mytable <- table(state.region)
class(mytable)  # reports "table"
# Labels such as "Northeast\n9"; the value holds a newline character, which
# is shown as the escape \n when the string is printed
lbls3 <- paste0(names(mytable), "\n", mytable)
# In the chart itself the newline is rendered as a line break
pie(
  x = mytable,
  labels = lbls3,
  main = "Pie Chart from a Table\n (with sample sizes)"
)
# Fan plot: the sectors are overlaid on one another; their width carries the
# information, the radius does not.
library(plotrix)
slices <- c(10, 12, 4, 16, 8)
lbls <- c("US", "UK", "Australia", "Germany", "France")
# Return to a single-plot layout BEFORE drawing. Otherwise the fan plot is
# squeezed into one cell of a fresh 2x2 page left over from the pie charts.
par(mfrow = c(1, 1))
fan.plot(slices, labels = lbls, main = "Fan Plot")
# Histograms, four per page in a 2x2 grid
par(mfrow = c(2, 2))

# Histogram 1: frequency histogram with default settings
hist(mtcars$mpg)

# Histogram 2: frequency histogram with 12 suggested bins, coloured
hist(
  mtcars$mpg,
  breaks = 12,
  col = "red",
  xlab = "Miles Per Gallon",
  main = "Colored histogram with 12 bins"
)

# Histogram 3: density-scaled histogram (freq = FALSE) with overlays
hist(
  mtcars$mpg,
  freq = FALSE,
  breaks = 12,
  col = "red",
  xlab = "Miles Per Gallon",
  main = "Histogram,rug plot,density curve"
)
rug(jitter(mtcars$mpg))                            # rug plot along the axis
lines(density(mtcars$mpg), col = "blue", lwd = 2)  # kernel density curve

# Histogram 4: frequency histogram with a normal curve and a box
x <- mtcars$mpg
h <- hist(
  x,
  breaks = 12,
  col = "red",
  xlab = "Miles Per Gallon",
  main = "Histogram with normal curve and box"
)
# 40 equally spaced values spanning the range of x (10.4 to 33.9).
# length.out is spelled out in full instead of relying on partial matching
# of "length".
xfit <- seq(min(x), max(x), length.out = 40)
diff(xfit)  # constant step of about 0.6025641 confirms the equal spacing
# dnorm(x, mean, sd) returns the normal density at x; for example, dnorm(z)
# is the standard normal density evaluated at z.
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
# Rescale the density to the histogram's count scale:
# density * bin width * number of observations
yfit <- yfit * diff(h$mids[1:2]) * length(x)
lines(xfit, yfit, col = "blue", lwd = 2)
box()  # draw a frame around the plot

# Return to a single-plot layout so the 2x2 grid does not leak into later plots
par(mfrow = c(1, 1))
# Kernel density plots: an effective way to look at the distribution of a
# continuous variable.
# Two plots stacked on one page
par(mfrow = c(2, 1))
# The estimate is computed once and reused by both plots
d <- density(mtcars$mpg)

# Plot 1: the simplest version, all defaults
plot(d)

# Plot 2: titled, filled, with a rug
plot(d, main = "Kernel Density of Miles Per Gallon")
# polygon() draws the shape given by the x/y vertices of the estimate:
# red fill under the curve, blue outline
polygon(d, col = "red", border = "blue")
# Rug plot along the axis
rug(mtcars$mpg, col = "brown")

# Return to a single-plot layout; otherwise every later plot in the script
# would be drawn in half a page.
par(mfrow = c(1, 1))
# Comparable kernel density plots with the sm package: several density
# curves drawn on the same axes. Overlaying them is a strong way to compare
# groups on one outcome variable.
# Install sm only when it is missing instead of on every run
if (!requireNamespace("sm", quietly = TRUE)) {
  install.packages("sm")
}
library("sm")

# Prepare the data. Columns are referenced through mtcars$ rather than
# attach(), so nothing is added to the search path.
cyl.f <- factor(
  mtcars$cyl,
  levels = c(4, 6, 8),
  labels = c("4 cylinder", "6 cylinder", "8 cylinder")
)
str(cyl.f)  # inspect the factor

# mpg is the numeric variable (x axis); cyl is the grouping variable that
# yields one density curve per group.
sm.density.compare(mtcars$mpg, mtcars$cyl, xlab = "Miles Per Gallon")
title(main = "MPG Distribution by Car Cylinders")

# levels(cyl.f) is "4 cylinder" "6 cylinder" "8 cylinder": three levels.
# One colour index per level starting at 2, i.e. 2, 3, 4.
# NOTE(review): assumes sm.density.compare draws its curves in colours
# 2, 3, 4 by default -- confirm against the sm documentation.
colfill <- seq_along(levels(cyl.f)) + 1L

# The factor only supplies the legend labels; colfill supplies the colours.
# Place the legend with a mouse click when possible. locator() yields NULL
# when no click is available (batch run, cancelled click), so fall back to a
# fixed corner instead of handing NULL to legend().
legend_pos <- if (interactive()) locator(1) else NULL
if (is.null(legend_pos)) {
  legend_pos <- "topright"
}
legend(legend_pos, levels(cyl.f), fill = colfill)
# Box plots
# Also called box-and-whisker plots. They describe the distribution of a
# continuous variable through its five-number summary: minimum, lower
# quartile (25th percentile), median (50th percentile), upper quartile
# (75th percentile) and maximum.
# IQR = upper quartile - lower quartile (the interquartile range).
# Values within [lower quartile - 1.5 * IQR, upper quartile + 1.5 * IQR]
# are treated as regular; values outside that range are outliers.

# Example 1: a single box plot
# mpg: miles per gallon
boxplot(
  x = mtcars$mpg,
  ylab = "Miles per Gallon",
  main = "Box plot"
)

# Notes on the output of boxplot.stats()
# $stats holds the five numbers: min, lower hinge, median, upper hinge, max
#   10.40 (min), 15.35 (lower), 19.20 (median), 22.80 (upper), 33.90 (max)
# IQR = 22.80 - 15.35 = 7.45, so the middle 50% of the data lies in
#   (15.35, 22.80)
# 1.5 * IQR = 11.175, so the outlier fences are
#   (15.35 - 11.175, 22.80 + 11.175) = (4.175, 33.975)
boxplot.stats(x = mtcars$mpg)
# Example 2: parallel box plots for comparing groups
# mpg: numeric, shown on the y axis
# cyl: grouping variable, shown on the x axis
# y ~ A     draws one box plot of numeric y for each value of categorical A
# y ~ A * B draws one box plot of y for every combination of the levels of
#           categorical A and B
boxplot(
  formula = mpg ~ cyl,
  data = mtcars,
  xlab = "Number of Cylinders",
  ylab = "Miles Per Gallon",
  main = "Car Mileage Data"
)

# Global graphics setting; it governs the label orientation of both the x
# and the y axis for the plots that follow.
par(las = 1)

# Example 3: notched box plots
# notch = TRUE    draws the boxes with notches
# varwidth = TRUE makes each box width proportional to the square root of
#                 its group's sample size
boxplot(
  formula = mpg ~ cyl,
  data = mtcars,
  col = "red",
  notch = TRUE,
  varwidth = TRUE,
  xlab = "Number of Cylinders",
  ylab = "Miles Per Gallon",
  main = "Car Mileage Data"
)
# Example 4: box plots for two crossed factors
# Factor for the number of cylinders
mtcars[["cyl.f"]] <- factor(
  mtcars[["cyl"]],
  levels = c(4, 6, 8),
  labels = c("4", "6", "8")
)
# Factor for the transmission type: am value 0 is labelled "auto",
# value 1 is labelled "standard"
mtcars[["am.f"]] <- factor(
  mtcars[["am"]],
  levels = c(0, 1),
  labels = c("auto", "standard")
)
# 3 cylinder levels x 2 transmission levels = 6 combinations, one box each
boxplot(
  formula = mpg ~ am.f * cyl.f,
  data = mtcars,
  col = c("gold", "darkgreen"),
  varwidth = TRUE,
  xlab = "Auto Type",
  ylab = "Miles Per Gallon",
  main = "MPG Distribution by Auto Type"
)
# Violin plots: a combination of a box plot and a kernel density plot
# Example 1
# Install vioplot only when it is missing instead of on every run
if (!requireNamespace("vioplot", quietly = TRUE)) {
  install.packages("vioplot")
}
library("vioplot")

# Subsetting a single column (a vector) takes just one index, so there is no
# comma inside the brackets. Each line collects the mpg values for one
# cylinder count.
x1 <- mtcars$mpg[mtcars$cyl == 4]  # mpg of the 4-cylinder cars
x2 <- mtcars$mpg[mtcars$cyl == 6]  # mpg of the 6-cylinder cars
x3 <- mtcars$mpg[mtcars$cyl == 8]  # mpg of the 8-cylinder cars

# In a violin plot the outline is a kernel density mirrored on both sides of
# a box plot. The white dot is the median, the black box runs from the lower
# to the upper quartile, and the thin black lines are the whiskers.
# col = "gold" is a single value; it is recycled across all three violins.
vioplot(
  x1, x2, x3,
  names = c("4 cyl", "6 cyl", "8 cyl"),
  col = "gold"
)
title(
  "Violin Plots of Miles Per Gallon",
  ylab = "Miles Per Gallon",
  xlab = "Number of Cylinders"
)
# Dot plots
# Example 1: plain dot plot, one labelled row per car model
dotchart(
  x = mtcars$mpg,
  labels = row.names(mtcars),
  cex = 0.7,
  xlab = "Miles Per Gallon",
  main = "Gas Mileage for Car Models"
)
# Example 2: decorated dot plot
# Reorder the rows of mtcars by ascending mpg and keep the result in x.
# A data frame has two dimensions, hence the trailing comma in the index.
x <- mtcars[order(mtcars$mpg), ]
# Turn the cyl column into a factor
x$cyl <- factor(x$cyl)
# Give every cyl level its own colour through a lookup keyed by level name;
# unname() keeps the new column a plain character vector.
cyl_palette <- c("4" = "red", "6" = "blue", "8" = "darkgreen")
x$color <- unname(cyl_palette[as.character(x$cyl)])
# Draw the dot plot. Dot plots become less useful as the number of points
# grows.
dotchart(
  x$mpg,
  labels = row.names(x),  # data frame row names identify each point
  groups = x$cyl,         # points are grouped by number of cylinders
  gcolor = "black",       # the group labels 4, 6, 8 are drawn in black
  color = x$color,        # points and labels use the per-row colour vector
  pch = 19,
  cex = 0.7,
  main = "Gas Mileage for Car Models\ngrouped by cylinder",
  xlab = "Miles Per Gallon"
)