# 小白读《R语言实战》写的读书笔记（第六章）
#
# 本文链接：https://blog.csdn.net/John_AWZ/article/details/136504577

#####第六章：基本图形#####
####1、条形图####
#通过垂直或水平的条形展示了分类变量的分布（频数）
#通过ggplot2包创建，代码如下：
# Template only (kept as a comment because `data` and `catvar` are
# placeholders and ggplot2 is not attached yet at this point):
#   ggplot(data, aes(x = catvar)) + geom_bar()
# `data` is a data frame and `catvar` is a categorical variable.
# This chapter uses the Arthritis data frame from the vcd package, so vcd is
# installed here if it is not available yet.
if (!requireNamespace("vcd", quietly = TRUE)) {
  install.packages("vcd")
}
# library() attaches one package per call: in library(grid, vcd) the second
# argument is matched to `help`, so vcd would not be attached.
library(grid)
library(vcd)

####1.1简单的条形图
#在关节炎的治疗中，变量Improved记录了对每位接受了安慰剂或药物的病人的治疗效果
# Load Arthritis from vcd and tabulate the Improved column.
data("Arthritis", package = "vcd")
table(Arthritis[["Improved"]])
# Chapter 7 discusses table() and counting values of a variable in detail.
# Listing 6-1: vertical and horizontal bar charts.
library(ggplot2)
# Shared base: one bar per level of Improved, bar height = row count.
improved_bars <- ggplot(data = Arthritis, mapping = aes(x = Improved)) +
  geom_bar()
# Simple (vertical) bar chart.
improved_bars +
  labs(y = "Frequency", x = "Improvement", title = "Simple Bar chart")
# Horizontal bar chart: the same layers with the coordinates flipped.
improved_bars +
  labs(y = "Frequency", x = "Improvement", title = "Horizontal Bar chart") +
  coord_flip()
# Handling long labels is covered in section 6.1.4.

####1.2堆积、分组和填充条形图
#关节炎新疗法研究的核心问题是：使用安慰剂和药物治疗这两种方式对疾病的改善有何差异？
#可以使用table()函数来生成交叉表
# Cross-tabulation of outcome (rows) by treatment (columns).
table(Arthritis[["Improved"]], Arthritis[["Treatment"]])
# The table is useful, but a bar chart is easier to read. The relationship
# between two categorical variables can be shown with stacked, grouped or
# filled bar charts.
# Listing 6-2
library(ggplot2)
# Shared base: one bar per Treatment, coloured by Improved.
by_treatment <- ggplot(data = Arthritis,
                       mapping = aes(x = Treatment, fill = Improved))
# Stacked bar chart: total bar height is the group size, each colour is the
# count for one Improved level (it mirrors the cross-table above).
by_treatment +
  geom_bar(position = "stack") +
  labs(y = "Frequency", x = "Treatment", title = "Stacked Bar chart")
# Grouped bar chart: the Improved levels are drawn side by side.
by_treatment +
  geom_bar(position = "dodge") +
  labs(y = "Frequency", x = "Treatment", title = "Grouped Bar chart")
# Filled bar chart: every bar is scaled to 100%, each colour shows the share
# of one Improved level.
by_treatment +
  geom_bar(position = "fill") +
  labs(y = "Proportion", x = "Treatment", title = "Filled Bar chart")
# Filled bar charts are handy for comparing the proportions of one
# categorical variable across the levels of another.
# The position argument of geom_bar() controls how the bars are arranged:
# "stack" (stacked, the geom_bar() default), "dodge" (grouped),
# "fill" (filled), "dodge2" (like dodge with extra options), or a custom
# position function.

####1.3均值条形图
#条形图不一定要基于计数数据或频数数据
#可以通过使用合适的统计量汇总数据并将结果传递给ggplot2
#从而创建表示均值、中位数、百分比、标准差等条形图
#代码6-3使用state.region数据集中的state.x77数据绘制美国各个地区的平均文盲率
#代码如下
# Install dplyr only when it is missing, so re-running the script does not
# reinstall it every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
# Combine the region factor with the state.x77 matrix into one data frame.
states <- data.frame(state.region, state.x77)
# Mean illiteracy rate per region.
plotdata <- states %>%
  group_by(state.region) %>%
  summarize(mean = mean(Illiteracy))
plotdata
# Without the pipe this would be:
#   plotdata <- summarize(group_by(states, state.region),
#                         mean = mean(Illiteracy))
# Sorted bar chart of the means. geom_bar() counts rows by default;
# geom_col() (equivalent to geom_bar(stat = "identity")) plots the supplied
# y values instead. reorder() sorts state.region by ascending mean.
ggplot(plotdata, aes(x = reorder(state.region, mean), y = mean)) +
  geom_col() +
  labs(x = "Region",
       y = " ",
       title = "Mean Illiteracy Rate")
# Listing 6-4: the same chart with error bars of +/- one standard error.
plotdata <- states %>%
  group_by(state.region) %>%
  summarize(
    # n() is the number of rows (states) in the current region group; the
    # result is stored in a new column called n.
    n = n(),
    mean = mean(Illiteracy),
    # standard error = standard deviation / sqrt(sample size)
    se = sd(Illiteracy) / sqrt(n)
  )
plotdata
# Sorted bar chart of the means with standard-error bars.
ggplot(plotdata, aes(x = reorder(state.region, mean), y = mean)) +
  geom_col(fill = "skyblue") +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.2) +
  labs(x = "Region",
       y = " ",
       title = "Mean Illiteracy Rate",
       subtitle = "with standard error bar")

####1.4条形图的微调
##1、条形图的颜色
#geom_bar()中fill="color"指定了区域的颜色，color="color"指定了边框的颜色
#通常情况下，ggplot2使用fill指定具有区域的几何对象，比如条形图、扇形图、方格图
#而color指定没有区域的几何对象，如线、点和边框
##代码如下：
# Single fixed colours: fill sets the bar area, color sets the bar outline.
data(Arthritis, package = "vcd")
ggplot(Arthritis, aes(x = Improved)) +
  geom_bar(fill = "gold", color = "black") +
  labs(title = "Treatment Outcome")  # title spelling corrected
# Colours can also be mapped to the levels of a categorical variable:
# fill = Improved colours each bar segment by outcome, and
# scale_fill_manual() assigns one colour per Improved level
# (None, Some, Marked), in that order.
ggplot(Arthritis, aes(x = Treatment, fill = Improved)) +
  geom_bar(position = "stack", color = "black") +
  scale_fill_manual(values = c("red", "grey", "gold")) +
  labs(title = "stacked Bar chart",
       x = "Treatment",
       y = "Frequency")

##2、条形图的标签
#当数据条很多或者标签很长时，条形图的标签可能会重叠而影响阅读
#以以下代码为例
# Shared base: number of rows per car model in mpg. With this many models
# the default x-axis labels overlap.
model_bars <- ggplot(data = mpg, mapping = aes(x = model)) +
  geom_bar()
model_labels <- labs(x = " ", y = "Frequency",
                     title = "Car Models in the mpg dataset")
# Default chart (overlapping labels).
model_bars + model_labels
# Option 1: draw the bars horizontally.
model_bars + model_labels + coord_flip()
# Option 2: rotate the labels and use a smaller font.
model_bars + model_labels +
  theme(axis.text.x = element_text(size = 8, hjust = 1, angle = 45))
# This kind of fine-tuning is covered again in chapter 19.

####2、饼图####
#用pie()来创建饼图，但这个函数的功能有限
#因此作者制作了一个名为ggpie的包，以下是该包的安装代码。
# The ggpie() used in the book comes from the book author's package, which
# is hosted on GitHub (rkabacoff/ggpie). The CRAN package that is also
# called "ggpie" is a different package with a different interface
# (group_key, count_type, ...), so install.packages("ggpie") does not
# provide the ggpie(data, x, by, ...) calls below.
# NOTE(review): the `offset` check assumes only the book's package has that
# argument - confirm against both packages' documentation.
if (!requireNamespace("remotes", quietly = TRUE)) {
  install.packages("remotes")
}
has_book_ggpie <- requireNamespace("ggpie", quietly = TRUE) &&
  "offset" %in% names(formals(ggpie::ggpie))
if (!has_book_ggpie) {
  remotes::install_github("rkabacoff/ggpie")
}
# Basic syntax (template, kept as a comment because it is not runnable):
#   ggpie(data, x, by, offset, percent, legend, title)
# data    - a data frame
# x       - the categorical variable to plot
# by      - optional second categorical variable; one pie per level
# offset  - distance of the slice labels from the centre; 0.5 puts the
#           label in the middle of the slice, values above 1.0 put it
#           outside the pie
# percent - logical; FALSE suppresses the percentages
# legend  - logical; FALSE omits the legend
# title   - plot title
# Listing 6-10
library(ggplot2)
library(ggpie)
ggpie(mpg, class)
# Original author's note: this call failed with the CRAN package installed.
# The equivalent call for the CRAN package (different output from the book)
# is:
#   ggpie(data = mpg, group_key = "class", count_type = "full")
# Next version: legend removed, every slice labelled outside the pie.
ggpie(mpg, class, legend = FALSE, offset = 1.3,
      title = "Automobiles by Car Class")
# Last version: distribution of car classes by year.
ggpie(mpg, class, year,
      legend = FALSE, offset = 1.3,
      title = "Car Class by Year")
# Original author's note: the last two calls also failed with the CRAN
# package; they need the GitHub package installed above.

####3、树形图####
#树形图用与变量水平成比例的矩形来显示分类变量的分布
#可以使用treemapify包来实现
# Install treemapify only when it is missing, so re-running the script does
# not reinstall it every time.
if (!requireNamespace("treemapify", quietly = TRUE)) {
  install.packages("treemapify")
}
# Listing 6-5: treemap of car manufacturers in the mpg data frame.
library(ggplot2)
library(dplyr)
library(treemapify)
# Number of rows per manufacturer (stored in column n).
plotdata <- mpg %>% count(manufacturer)
ggplot(plotdata,
       aes(fill = manufacturer,      # categorical variable for the colours
           area = n,                 # tile area = count per level
           label = manufacturer)) +  # optional label variable
  geom_treemap() +                   # draw the tiles
  geom_treemap_text() +              # add the tile labels
  theme(legend.position = "none")    # drop the legend
# The frequency of each manufacturer is computed first and the resulting
# data frame is then passed to ggplot2 to draw the figure.
##下一段代码添加了第2个变量：drivetrain，用来绘制各汽车厂商生产的前轮驱动、后轮驱动和四轮驱动的数量情况
# Treemap of manufacturers split by drivetrain (drv).
library(ggplot2)
library(dplyr)
library(treemapify)
# Row count for every manufacturer / drv combination.
plotdata <- count(mpg, manufacturer, drv)
# Replace the drv codes with readable labels.
plotdata[["drv"]] <- factor(
  plotdata[["drv"]],
  levels = c("4", "f", "r"),
  labels = c("4-wheel", "front-wheel", "rear")
)
ggplot(
  data = plotdata,
  mapping = aes(
    fill = manufacturer,
    area = n,
    label = manufacturer,
    subgroup = drv                   # one sub-area per drivetrain
  )
) +
  geom_treemap() +
  geom_treemap_subgroup_border() +   # outline each drivetrain sub-area
  geom_treemap_subgroup_text(        # label each drivetrain sub-area
    place = "middle",
    colour = "black",
    alpha = 0.5,                     # label transparency
    grow = FALSE                     # keep font size, do not fill the area
  ) +
  geom_treemap_text(
    colour = "white",
    place = "centre",
    grow = FALSE
  ) +
  theme(legend.position = "none")

####4、直方图####
#直方图通过将x轴上的值域分割为一定数量的数据桶，在y轴上显示相应值的频数，展示了连续变量的分布
#函数表达如下：
# Template only (kept as a comment because `data` and `contvar` are
# placeholders):
#   ggplot(data, aes(x = contvar)) + geom_histogram()
# `data` is a data frame and `contvar` is a continuous variable.
# Listing 6-7: distribution of miles per gallon for the 2008 cars in mpg.
library(ggplot2)
library(scales)
data(mpg)
cars2008 <- mpg[mpg$year == 2008, ]

# Simple histogram with the default number of bins.
ggplot(cars2008, aes(x = cty)) +
  geom_histogram() +
  labs(title = "Default histogram")
# Coloured histogram with 20 bins. The plotted variable is hwy, so the axis
# is labelled as highway mileage; use aes(x = cty) instead if city mileage
# is intended.
ggplot(cars2008, aes(x = hwy)) +
  geom_histogram(bins = 20, color = "white", fill = "steelblue") +
  labs(title = "Colored histogram with 20 bins",
       x = "Highway Miles Per Gallon",
       y = "Frequency")
# Histogram with percentages. after_stat(density) replaces the deprecated
# ..density.. notation; the mileage is on x and the percentage on y.
ggplot(cars2008, aes(x = hwy, y = after_stat(density))) +
  geom_histogram(bins = 20, color = "white", fill = "steelblue") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Histogram with Percentages",
       x = "Highway Miles Per Gallon",
       y = "Percent")
# Histogram with a kernel density curve. Line thickness is set with
# linewidth; using size for lines triggers a deprecation warning in
# ggplot2 3.4.0 and later.
ggplot(cars2008, aes(x = hwy, y = after_stat(density))) +
  geom_histogram(bins = 20, color = "white", fill = "steelblue") +
  scale_y_continuous(labels = scales::percent) +
  geom_density(color = "red", linewidth = 1) +
  labs(title = "Histogram with Density curve",
       x = "Highway Miles Per Gallon",
       y = "Percent")
# Original author's note: R was updated to 4.3.3 while writing these notes,
# so the output may look slightly different from the book.



####5、核密度图####
#核密度图是估计随机变量概率密度函数的一种非参数方法。
#其格式如下：
# Template only (kept as a comment because `data` and `contvar` are
# placeholders):
#   ggplot(data, aes(x = contvar)) + geom_density()
# `data` is a data frame and `contvar` is a continuous variable.
# Listing 6-8: city miles per gallon of the 2008 cars as density plots.
library(ggplot2)
data(mpg)
cars2008 <- mpg[mpg$year == 2008, ]

# Default kernel density plot.
ggplot(cars2008, aes(x = cty)) +
  geom_density() +
  labs(title = "Default kernel density plot")
# Filled kernel density plot (the title previously repeated "Default").
ggplot(cars2008, aes(x = cty)) +
  geom_density(fill = "red") +
  labs(title = "Filled kernel density plot",
       x = "City Miles Per Gallon")
# Print the bandwidth computed by bw.nrd0(); a larger bandwidth gives a
# smoother curve.
bw.nrd0(cars2008$cty)
# Kernel density plot with a smaller bandwidth.
ggplot(cars2008, aes(x = cty)) +
  geom_density(fill = "red", bw = .5) +
  labs(title = "kernel density plot with bw=0.5",
       x = "City Miles Per Gallon")
#核密度图可以进行组间比较
#代码清单6-9比较了2008年四缸车、六缸车和八缸车每加仑汽油行驶英里数的估计值
#五缸车很少，所以在分析中要剔除这部分数据
#代码如下
# Prepare the data: 2008 cars without the five-cylinder models, plus a
# factor column Cylinders built from cyl.
data(mpg, package = "ggplot2")
cars2008 <- mpg[mpg$year == 2008 & mpg$cyl != 5, ]
cars2008$Cylinders <- factor(cars2008$cyl)
# One density curve per cylinder count, distinguished by colour and line
# type. linewidth sets the line thickness (it replaces size for lines).
ggplot(cars2008, aes(x = cty, color = Cylinders, linetype = Cylinders)) +
  geom_density(linewidth = 1) +
  labs(title = "Fuel Efficiency by Number of Cylinders",
       x = "City Miles per Gallon")
# Filled, semi-transparent density curves per cylinder count.
ggplot(cars2008, aes(x = cty, fill = Cylinders)) +
  geom_density(alpha = .4) +
  labs(title = "Fuel Efficiency by Number of Cylinders",
       x = "City Miles per Gallon")
# Colour-coded ggplot2 figures are hard to tell apart on a black-and-white
# printer; see page 129 of the book for ways around this.


####6、箱线图####
#箱线图（又称盒须图）通过绘制连续型变量的5个统计量：
#即最小值、下四分位数（第25百分位数）、中位数（第50百分位数）、上四分位数（第75百分位数）以及最大值，描述了连续型变量的分布
#书中图6-19代码如下：
# Single box plot of mtcars$mpg; the empty x value yields one unlabelled box.
ggplot(data = mtcars, mapping = aes(y = mpg, x = "")) +
  geom_boxplot() +
  labs(title = "Box Plot", x = "", y = "Miles Per Gallon")
# Original author's note: the result differs slightly from figure 6-19.
# Print the statistics used to draw the box plot.
boxplot.stats(mtcars[["mpg"]])

####6.6.1使用并列箱线图进行跨组比较
#箱线图是比较根据分类变量各水平分组的定量变量分布的有效方法。
#书中再次比较四缸六缸和八缸汽车每加仑汽油行驶英里数，但是这次我们将使用1999年和2008年的数据。
#因为五缸车很少，所以删除五缸车的数据。
#我们还要将year和cyl从连续型数值变量转化为分类（分组）因子：
#其代码如下：
# Prepare the data: drop the five-cylinder cars and turn cyl and year into
# grouping factors.
library(ggplot2)
cars <- mpg[mpg$cyl != 5, ]
cars$Cylinders <- factor(cars$cyl)
cars$Year <- factor(cars$year)
# Side-by-side box plots of city mileage by cylinder count.
ggplot(cars, aes(x = Cylinders, y = cty)) +
  geom_boxplot() +
  labs(x = "Number of Cylinders",
       y = "Miles Per Gallon",
       title = "Car Mileage Data")
# Notched box plots; varwidth = TRUE makes each box width proportional to
# the square root of that group's sample size.
ggplot(cars, aes(x = Cylinders, y = cty)) +
  geom_boxplot(notch = TRUE,
               fill = "steelblue",
               varwidth = TRUE) +
  labs(x = "Number of Cylinders",
       y = "Miles Per Gallon",
       title = "Car Mileage Data")
# Box plots for two grouping factors (cylinder count and year, figure 6-21);
# scale_fill_manual() sets one fill colour per year.
ggplot(cars, aes(x = Cylinders, y = cty, fill = Year)) +
  geom_boxplot() +
  labs(x = "Number of Cylinders",
       y = "Miles Per Gallon",
       title = "Car Mileage by # Cylinders and Year") +
  scale_fill_manual(values = c("gold", "green"))

####6.6.2小提琴图
#小提琴图是箱线图与核密度图的结合。
#可以使用函数geom_violin()绘制它。
#在代码清单6-10中，在箱线图中添加小提琴图，绘图结果如图6-23所示。
# Listing 6-10: violin plots drawn over box plots (figure 6-23).
library(ggplot2)
cars <- mpg[mpg$cyl != 5, ]
cars$Cylinders <- factor(cars$cyl)

ggplot(cars, aes(x = Cylinders, y = cty)) +
  geom_boxplot(width = .2,       # narrow boxes so they fit inside the violins
               fill = "green") +
  geom_violin(fill = "gold",
              alpha = 0.3) +     # transparent so the boxes stay visible
  labs(x = "Number of Cylinders",
       y = "Miles Per Gallon",
       title = "Violin Plots of Miles Per Gallon")
#小提琴图基本上是核密度图以镜像方式在箱线图上的叠加。
#在图中，中间线是中位数，黑色盒子的范围是下四分位点到上四分位点，细黑线表示须。
#点表示离群值。
#外部形状即为核密度图。
#从图上可知八缸车的分布可能是双峰型的，这是单独使用箱线图时看不出来的。


####7、点图####
#点图提供了一种在简单水平刻度上绘制大量标签值的方法。
#在ggplot2中用函数geom_point()来创建点图（基础R中对应的函数是dotchart()），其格式如下：
# Template only (kept as a comment because `data`, `contvar` and `catvar`
# are placeholders):
#   ggplot(data, aes(x = contvar, y = catvar)) + geom_point()
# `data` is a data frame, `contvar` is a continuous variable and `catvar`
# is a categorical variable.
# The example uses highway miles per gallon of the 2008 cars in mpg,
# averaged per car model.
library(ggplot2)
library(dplyr)
# Prepare the data: mean hwy per model for 2008. year is numeric in mpg, so
# it is compared with the number 2008 rather than the string "2008".
plotdata <- mpg %>%
  filter(year == 2008) %>%
  group_by(model) %>%
  summarize(meanHwy = mean(hwy))
plotdata
# Dot plot with the models in their default order.
ggplot(plotdata, aes(x = meanHwy, y = model)) +
  geom_point() +
  labs(x = "Miles Per Gallon",
       y = "",
       title = "Gas Mileage for Car Model")
# Dot plot with the models sorted from lowest to highest mean mileage.
ggplot(plotdata, aes(x = meanHwy, y = reorder(model, meanHwy))) +
  geom_point() +
  labs(x = "Miles Per Gallon",
       y = "",
       title = "Gas Mileage for Car Model")