R语言数据分析实战-CSDN博客

文章目录

- R语言数据分析120题

R语言数据分析120题

library(tidyverse)
library(zoo)
library(writexl)

P1-20

1（创建数据框）：将下面的字典创建为DataFrame

data = {“grammer”: [“Python”,“C”,“Java”,“GO”,np.nan,“SQL”,“PHP”,“Python”], “score”:[1,2,np.nan,4,5,6,7,10]}

library(tibble)
df <- tibble( 
grammer = c("Python","C","Java","GO", NA,"SQL","PHP","Python"), 
    score = c(1,2,NA,4,5,6,7,10)
 )
df

A tibble: 8 × 2
grammer	score
<chr>	<dbl>
Python	1
C	2
Java	NA
GO	4
NA	5
SQL	6
PHP	7
Python	10

df_1 <- data.frame( 
grammer = c("Python","C","Java","GO", NA,"SQL","PHP","Python"), 
    score = c(1,2,NA,4,5,6,7,10)
 )
df_1

A data.frame: 8 × 2
grammer	score
<chr>	<dbl>
Python	1
C	2
Java	NA
GO	4
NA	5
SQL	6
PHP	7
Python	10

2（筛选行）：提取含有字符串"Python"的行

整个

df %>% 
    filter(grammer == "Python")

A tibble: 2 × 2
grammer	score
<chr>	<dbl>
Python	1
Python	10

包含部分字符

R语言使用dplyr提取行列：https://www.cnblogs.com/chenwenyan/p/15261246.html

df_1 <- dplyr::filter(df, grepl('P', grammer))
df_1

A tibble: 3 × 2
grammer	score
<chr>	<dbl>
Python	1
PHP	7
Python	10

#sessionInfo()  查看R版本

#R.Version()

3（查看列名）：输出df的所有列名

names(df)

'grammer'
'score'

4（修改列名）：修改第二列列名为popularity

重命名函数rename()

df = df %>% rename(popularity = score)
names(df)

'grammer'
'popularity'

5（统计频数）：统计grammer列中每一种编程语言出现的次数

计数函数count()

df %>% count(grammer)

A tibble: 7 × 2
grammer	n
<chr>	<int>
C	1
GO	1
Java	1
PHP	1
Python	2
SQL	1
NA	1

6（缺失值处理）：将空值用上下值的平均值填充

https://blog.csdn.net/weixin_51077152/article/details/119945978

(1)判断元素中是否含有缺失值：is.na()函数

(2)去掉数据集中的缺失值（行删除）：
complete.cases() 存储没有缺失值的数据框

na.omit（）函数去掉有缺失值的行
na.omit(object,cols,invert)
object::数据集
cols:一个由列名(或数字)组成的向量，用于检查缺失值。默认值是所有列。
invert:如果FALSE将省略所有缺少值的行(默认)。TRUE只返回那些缺少值的行。

(3)计算每一列/每一行的缺失值数，用sleep数据集
sleep数据集引用之前需要先下载程序包VIM

na.approx (in zoo package):计算插值替换NA值

df = df %>% mutate(popularity = zoo::na.approx(popularity))
df

A tibble: 8 × 2
grammer	popularity
<chr>	<dbl>
Python	1
C	2
Java	3
GO	4
NA	5
SQL	6
PHP	7
Python	10

mutate函数，关于mutate函数补充如下：

# 删除grammer列的数据，即将null值赋予列
df %>%
  mutate(grammer=NULL)

A tibble: 8 × 1
popularity
<dbl>
1
2
3
4
5
6
7
10

7（筛选行）：提取popularity列中值大于3的行

df %>% filter(popularity >3)

A tibble: 5 × 2
grammer	popularity
<chr>	<dbl>
GO	4
NA	5
SQL	6
PHP	7
Python	10

8（数据去重）：按grammer列进行去重

distinct()过滤多余的重复记录只保留一条
.keep_all = TRUE显示所有的变量, = FALSE只显示降重列

df %>% distinct(grammer, .keep_all = TRUE)

A tibble: 7 × 2
grammer	popularity
<chr>	<dbl>
Python	1
C	2
Java	3
GO	4
NA	5
SQL	6
PHP	7

9（数据计算）：计算popularity列平均值

df %>%
  summarise(popularity_avg = mean(popularity))
  # summarise()创建新的data frame
  # 可以实现分类汇总

A tibble: 1 × 1
popularity_avg
<dbl>
4.75

10（格式转换）：将grammer列转换为序列

df$grammer

'Python'
'C'
'Java'
'GO'
NA
'SQL'
'PHP'
'Python'

11（数据保存）：将数据框保存为Excel

writexl::write_xlsx(df, "filename.xlsx")

12（数据查看）：查看数据的行数列数

dim(df)

13（筛选行）：提取popularity列值大于3小于7的行

df %>%
  filter(popularity > 3 & popularity < 7)

A tibble: 3 × 2
grammer	popularity
<chr>	<dbl>
GO	4
NA	5
SQL	6

14（调整列位置）：交互两列的位置

df %>%
  select(popularity, grammer)
  # select()仅保留提到的变量

A tibble: 8 × 2
popularity	grammer
<dbl>	<chr>
1	Python
2	C
3	Java
4	GO
5	NA
6	SQL
7	PHP
10	Python

15（筛选行）：提取popularity列最大值所在的行

df %>%
  filter(popularity == max(popularity))

A tibble: 1 × 2
grammer	popularity
<chr>	<dbl>
Python	10

16（查看数据）：查看最后几行数据

tail(df)
head(df) 
# tail默认最后6行，head默认前6行

A tibble: 6 × 2
grammer	popularity
<chr>	<dbl>
Java	3
GO	4
NA	5
SQL	6
PHP	7
Python	10

A tibble: 6 × 2
grammer	popularity
<chr>	<dbl>
Python	1
C	2
Java	3
GO	4
NA	5
SQL	6

17（修改数据）：删除最后一行数据

df %>% 
  slice(-n())
  # 删除第n行的数据，即最后一行的数据

A tibble: 7 × 2
grammer	popularity
<chr>	<dbl>
Python	1
C	2
Java	3
GO	4
NA	5
SQL	6
PHP	7

在这里用的是slice函数，关于slice函数补充如下：

# 如果选择某一特定行数据，例如选取第三行：
df %>% 
  slice(3)
# 如果删去某一特定行数据，例如删去第三行：
df %>% 
  slice(-3)
# 如果选择特定几行数据，例如选取第3-5行：
df %>% 
  slice(3:5)
# 如果删去特定几行数据，例如删除第3-5行：
df %>% 
  slice(-(3:5))

A tibble: 1 × 2
grammer	popularity
<chr>	<dbl>
Java	3

A tibble: 7 × 2
grammer	popularity
<chr>	<dbl>
Python	1
C	2
GO	4
NA	5
SQL	6
PHP	7
Python	10

A tibble: 3 × 2
grammer	popularity
<chr>	<dbl>
Java	3
GO	4
NA	5

A tibble: 5 × 2
grammer	popularity
<chr>	<dbl>
Python	1
C	2
SQL	6
PHP	7
Python	10

18（修改数据）：添加一行数据：“Perl”, 6

newrow = tibble(grammer="Perl", popularity=6)
  # 定义新的newrow，a tibble：1*2
df %>%
  bind_rows(newrow)
  # 使用bind_rows()，合并两个tibble

A tibble: 9 × 2
grammer	popularity
<chr>	<dbl>
Python	1
C	2
Java	3
GO	4
NA	5
SQL	6
PHP	7
Python	10
Perl	6

19（数据整理）：对数据按popularity列值从大到小排序

df %>%
  arrange(desc(popularity))

A tibble: 8 × 2
grammer	popularity
<chr>	<dbl>
Python	10
PHP	7
SQL	6
NA	5
GO	4
Java	3
C	2
Python	1

arrange函数（排序函数） :
arrange(desc(data)) 从大到小
arrange(data) 从小到大

# 对数据按popularity列值从到小到大排序
df %>%
  arrange(popularity)

A tibble: 8 × 2
grammer	popularity
<chr>	<dbl>
Python	1
C	2
Java	3
GO	4
NA	5
SQL	6
PHP	7
Python	10

20（字符统计）：统计grammer列每个字符串的长度

mutate()函数对数据进行增加列

df %>%
  mutate(strlen = str_length(grammer))
  # 使用mutate()函数对数据进行增加列
  # 定义strlen为新列，赋值为grammer的字符串长度

A tibble: 8 × 3
grammer	popularity	strlen
<chr>	<dbl>	<int>
Python	1	6
C	2	1
Java	3	4
GO	4	2
NA	5	NA
SQL	6	3
PHP	7	3
Python	10	6

P21-30

21（读取数据）：读取本地Excel数据

df = readxl::read_xlsx("./data_R/21-50数据.xlsx")
# 需要保证数据在当前路径，可通过setwd()设置

22（查看数据）：查看df数据的前几行（后几行）

head(df)
#tail(df)

A tibble: 6 × 3
createTime	education	salary
<dttm>	<chr>	<chr>
2020-03-16 11:30:18	本科	20k-35k
2020-03-16 10:58:48	本科	20k-40k
2020-03-16 10:46:39	不限	20k-35k
2020-03-16 10:45:44	本科	13k-20k
2020-03-16 10:20:41	本科	10k-20k
2020-03-16 10:33:48	本科	10k-18k

23（数据计算）：将salary列数据转换为最大值与最小值的平均值

df = df %>%
    separate(salary, into = c("low", "high"), sep = "-")     
    # 先将salary列分成两列low和high
df = df %>%   
    mutate(salary = (parse_number(low) + parse_number(high)) * 1000 / 2)
    # 将low和high两列的数字部分运算，赋值给salary
    # parse_number()指提取变量中的数字部分
df = df %>%
  select(-c(low, high))
    # 删除low和high列
    # 也可以用#20的mutate函数删除列

head(df)

A tibble: 6 × 3
createTime	education	salary
<dttm>	<chr>	<dbl>
2020-03-16 11:30:18	本科	27500
2020-03-16 10:58:48	本科	30000
2020-03-16 10:46:39	不限	27500
2020-03-16 10:45:44	本科	16500
2020-03-16 10:20:41	本科	15000
2020-03-16 10:33:48	本科	14000

24（分组汇总）：根据学历分组，并计算平均薪资

df %>% 
  group_by(education) %>% summarise(salary_avg = mean(salary))
    # 将数据中的education相同取值的数据进行分组聚类，tiblle属性包括有Groups: education [4]
      # 创建新的数据框data frame，包含有group，salary_avg

A tibble: 4 × 2
education	salary_avg
<chr>	<dbl>
本科	19361.34
不限	19600.00
大专	10000.00
硕士	20642.86

summarise函数（聚合函数），关于summarise函数补充如下：

# 创建新的数据框data frame，命名为salary，包含有group，salary_avg，salary_max，salary_min
salary=df %>% 
  group_by(education) %>% 
  summarise(salary_avg = mean(salary),salary_max=max(salary),salary_min=min(salary))

salary

A tibble: 4 × 4
education	salary_avg	salary_max	salary_min
<chr>	<dbl>	<dbl>	<dbl>
本科	19361.34	45000	3500
不限	19600.00	30000	3500
大专	10000.00	15000	5000
硕士	20642.86	37500	12500

25（时间转换）：将createTime列转换为"月-日"

library(lubridate)

head(
    
df %>% 
  mutate(createTime = str_c(month(createTime), "-", day(createTime)))
)

  # 增加列createTime，包含有月份-日期
  # str_c合并字符串

A tibble: 6 × 3
createTime	education	salary
<chr>	<chr>	<dbl>
3-16	本科	27500
3-16	本科	30000
3-16	不限	27500
3-16	本科	16500
3-16	本科	15000
3-16	本科	14000

26（查看数据）：查看数据结构信息

df %>% 
  glimpse()
  # 查看数据结构信息
object.size(df)
  # 查看对象占用内存

Rows: 135
Columns: 3
$ createTime [3m[90m<dttm>[39m[23m 2020-03-16 11:30:18, 2020-03-16 10:58:48, 2020-03-16 10:46~
$ education  [3m[90m<chr>[39m[23m "本科", "本科", "不限", "本科", "本科", "本科", "硕士", "本~
$ salary     [3m[90m<dbl>[39m[23m 27500, 30000, 27500, 16500, 15000, 14000, 23000, 12500, 700~



5112 bytes

27（查看数据）：查看数据汇总信息

summary(df)

   createTime                   education             salary     
 Min.   :2020-03-13 18:01:31   Length:135         Min.   : 3500  
 1st Qu.:2020-03-16 10:41:19   Class :character   1st Qu.:14000  
 Median :2020-03-16 11:00:27   Mode  :character   Median :17500  
 Mean   :2020-03-16 10:16:35                      Mean   :19159  
 3rd Qu.:2020-03-16 11:19:03                      3rd Qu.:25000  
 Max.   :2020-03-16 11:36:07                      Max.   :45000

28（修改列）：新增一列将salary离散化为三水平值

case_when函数，允许定义多个if_else()语句，等效于SQL中CASE WHEN 语句

df = df %>%
  mutate(class = case_when(
    salary >= 0 & salary < 5000      ~ "低",
    salary >= 5000 & salary < 20000 ~ "中",
TRUE                                ~ "高"))      
  # 如果salary的取值在[0,5000)，赋值class=低；
  # 如果salary的取值在[5000,20000)，赋值class=中；
  # 如果salary的取值在[20000,+∞)，赋值class=高；
  # TRUE的效果是其它，等效于salry>=20000

head(df)

A tibble: 6 × 4
createTime	education	salary	class
<dttm>	<chr>	<dbl>	<chr>
2020-03-16 11:30:18	本科	27500	高
2020-03-16 10:58:48	本科	30000	高
2020-03-16 10:46:39	不限	27500	高
2020-03-16 10:45:44	本科	16500	中
2020-03-16 10:20:41	本科	15000	中
2020-03-16 10:33:48	本科	14000	中

29（数据整理）：按salary列对数据降序排列

df = df %>% 
  arrange(desc(salary))
  # 同#19

30（筛选行）：提取第33行数据

df[33,]

A tibble: 1 × 4
createTime	education	salary	class
<dttm>	<chr>	<dbl>	<chr>
2020-03-16 10:54:56	本科	25000	高

P31-40

31（数据计算）：计算salary列的中位数

计算中位数median()

median(df$salary)

17500

df %>% 
  summarise(salary_med = median(salary))
# 创建新的tibble 1*1，将salary列的中位数赋值给salary_med

A tibble: 1 × 1
salary_med
<dbl>
17500

32（数据可视化）：绘制salary列的频率分布直方图

png('./data_R/out/32_直方图.png')
df %>%
  ggplot(aes(x = salary)) +
  geom_histogram(bins = 10)
  # salary设定为x轴绘图，设定图形格式为直方图，分为10组
dev.off()

png: 2

33（数据可视化）：绘制salary的频率密度曲线图

png('./data_R/out/33_曲线图.png')
df %>%
  ggplot(aes(x = salary)) +
  geom_density()
  # 同#32，设定图形格式为密度曲线
dev.off()

png: 2

34（数据删除）：删除列class

head(
    df %>% 
      select(-class)
)
# 选择class列删除
# 或者同#6的补充，给class列赋空值即删去，如下

head(
    df %>%
      mutate(class = NULL)
)

A tibble: 6 × 3
createTime	education	salary
<dttm>	<chr>	<dbl>
2020-03-16 11:30:17	本科	45000
2020-03-16 11:04:00	本科	40000
2020-03-16 10:36:57	本科	37500
2020-03-16 11:01:39	本科	37500
2020-03-16 09:54:47	硕士	37500
2020-03-16 11:01:22	本科	35000

A tibble: 6 × 3
createTime	education	salary
<dttm>	<chr>	<dbl>
2020-03-16 11:30:17	本科	45000
2020-03-16 11:04:00	本科	40000
2020-03-16 10:36:57	本科	37500
2020-03-16 11:01:39	本科	37500
2020-03-16 09:54:47	硕士	37500
2020-03-16 11:01:22	本科	35000

35（数据操作）：将df的第1列与第2列合并为新的一列

unite函数
"newcol"为合并列的新命名，1：2为合并1、2列，sep = " "为合并中间间隔空格
合并命名为newcol，1、2、3列，合并列由逗号区分

head(
    df %>% 
      unite("newcol", 1:2, sep = " ")
)

A tibble: 6 × 3
newcol	salary	class
<chr>	<dbl>	<chr>
2020-03-16 11:30:17 本科	45000	高
2020-03-16 11:04:00 本科	40000	高
2020-03-16 10:36:57 本科	37500	高
2020-03-16 11:01:39 本科	37500	高
2020-03-16 09:54:47 硕士	37500	高
2020-03-16 11:01:22 本科	35000	高

36（数据操作）：将education列与第salary列合并为新的一列

head(
    df %>% unite("newcol", c(education, salary), sep = " ")
    )
  # 同#35

A tibble: 6 × 3
createTime	newcol	class
<dttm>	<chr>	<chr>
2020-03-16 11:30:17	本科 45000	高
2020-03-16 11:04:00	本科 40000	高
2020-03-16 10:36:57	本科 37500	高
2020-03-16 11:01:39	本科 37500	高
2020-03-16 09:54:47	硕士 37500	高
2020-03-16 11:01:22	本科 35000	高

37（数据计算）：计算salary最大值与最小值之差

max(df$salary) - min(df$salary)

41500

38（数据操作）：将第一行与最后一行拼接

bind_rows(df[1,], df[nrow(df),])
# 第一行df[1,]
# 最后一行df[nrow(df),]

A tibble: 2 × 4
createTime	education	salary	class
<dttm>	<chr>	<dbl>	<chr>
2020-03-16 11:30:17	本科	45000	高
2020-03-16 10:48:43	本科	3500	低

39（数据操作）：将第8行添加到末尾

bind_rows(df, df[8,]) %>%  tail()
#将第八行合并至末尾，同#38
# 显示末尾行

A tibble: 6 × 4
createTime	education	salary	class
<dttm>	<chr>	<dbl>	<chr>
2020-03-16 11:20:44	本科	4500	低
2020-03-16 11:12:04	本科	4000	低
2020-03-16 11:12:04	本科	4000	低
2020-03-16 10:44:23	不限	3500	低
2020-03-16 10:48:43	本科	3500	低
2020-03-16 10:41:20	本科	35000	高

40（查看数据）：查看每一列的数据类型

df %>% 
  glimpse()
  # 同#26

Rows: 135
Columns: 4
$ createTime [3m[90m<dttm>[39m[23m 2020-03-16 11:30:17, 2020-03-16 11:04:00, 2020-03-16 10:36~
$ education  [3m[90m<chr>[39m[23m "本科", "本科", "本科", "本科", "硕士", "本科", "本科", "本~
$ salary     [3m[90m<dbl>[39m[23m 45000, 40000, 37500, 37500, 37500, 35000, 35000, 35000, 325~
$ class      [3m[90m<chr>[39m[23m "高", "高", "高", "高", "高", "高", "高", "高", "高", "高",~

P41-50

41（数据操作）：将createTime列设置为行索引

head(
df %>% distinct(createTime, .keep_all = TRUE) %>% column_to_rownames("createTime")
)

A data.frame: 6 × 3
	education	salary	class
	<chr>	<dbl>	<chr>
2020-03-16 11:30:17	本科	45000	高
2020-03-16 11:04:00	本科	40000	高
2020-03-16 10:36:57	本科	37500	高
2020-03-16 11:01:39	本科	37500	高
2020-03-16 09:54:47	硕士	37500	高
2020-03-16 11:01:22	本科	35000	高

42（数据创建）：生成一个和df长度相同的随机数数据框

df1 = tibble(rnums = sample.int(10, nrow(df), replace = TRUE))
head(df1)
# 创建这样的一个tibble数据框，将随机数赋值给irnums，要求为：数据为10以内正整数，数量同df，可以重复

A tibble: 6 × 1
rnums
<int>
9
10
7
9
5
4

43（数据连接）：将上面生成的数据框与df按列合并

df = bind_cols(df, df1)
head(df)

A tibble: 6 × 5
createTime	education	salary	class	rnums
<dttm>	<chr>	<dbl>	<chr>	<int>
2020-03-16 11:30:17	本科	45000	高	9
2020-03-16 11:04:00	本科	40000	高	10
2020-03-16 10:36:57	本科	37500	高	7
2020-03-16 11:01:39	本科	37500	高	9
2020-03-16 09:54:47	硕士	37500	高	5
2020-03-16 11:01:22	本科	35000	高	4

44（修改列）：生成新列new为salary列减去随机数列

df = df %>% 
  mutate(new = salary - rnums)

45（检查缺失值）：检查数据中是否含有任何缺失值

anyNA(df)

FALSE

46（类型转换）：将rnums列的类型转换为浮点数

head(
    df %>% mutate(rnums = as.double(rnums))
  # rnums为int型，使用as.double进行转换，其他形式也类比
)

A tibble: 6 × 6
createTime	education	salary	class	rnums	new
<dttm>	<chr>	<dbl>	<chr>	<dbl>	<dbl>
2020-03-16 11:30:17	本科	45000	高	9	44991
2020-03-16 11:04:00	本科	40000	高	10	39990
2020-03-16 10:36:57	本科	37500	高	7	37493
2020-03-16 11:01:39	本科	37500	高	9	37491
2020-03-16 09:54:47	硕士	37500	高	5	37495
2020-03-16 11:01:22	本科	35000	高	4	34996

47（数据汇总）：计算salary列大于10000的次数

df %>% 
  count(salary > 10000)
  # 同#5

A tibble: 2 × 2
salary > 10000	n
<lgl>	<int>
FALSE	16
TRUE	119

  # 或者利用summarise函数进行汇总，如下
df %>% 
  summarise(n = sum(salary > 10000))

A tibble: 1 × 1
n
<int>
119

48（统计频数）：查看每种学历出现的次数

df %>% count(education)

A tibble: 4 × 2
education	n
<chr>	<int>
本科	119
不限	5
大专	4
硕士	7

table(df$education)

本科不限大专硕士
119 5 4 7

49（数据汇总）：查看education列共有几种学历

df %>% distinct(education)
# 同#8，此题默认.keep_all = FALSE，也就是除去education都不显示

A tibble: 4 × 1
education
<chr>
本科
硕士
不限
大专

50（筛选行）：提取salary与new列之和大于60000的最后3行

df %>% filter(salary + new > 60000) %>% slice((n()-2):n())
# filter过滤出salary+new>6000的行
# slice选择其中的后三行

A tibble: 3 × 6
createTime	education	salary	class	rnums	new
<dttm>	<chr>	<dbl>	<chr>	<int>	<dbl>
2020-03-16 10:41:20	本科	35000	高	4	34996
2020-03-16 10:41:20	本科	35000	高	7	34993
2020-03-16 11:30:10	本科	32500	高	8	32492

P51-60

51（读取数据）：使用绝对路径读取本地Excel数据

df <- readxl::read_xls("C:/Users/86156/.jupyter/Code/Jupyter-Notebook/R_Python/data_R/51-80数据.xls")
head(df, 3)

Registered S3 methods overwritten by 'tibble':
  method     from  
  format.tbl pillar
  print.tbl  pillar

A tibble: 3 × 18
代码	简称	日期	前收盘价(元)	开盘价(元)	最高价(元)	最低价(元)	收盘价(元)	成交量(股)	成交金额(元)	涨跌(元)	涨跌幅(%)	均价(元)	换手率(%)	A股流通市值(元)	总市值(元)	A股流通股本(股)	市盈率
<chr>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>
600000.SH	浦发银行	2016_1_4	16.1356	16.1444	16.1444	15.4997	15.7205	42240610	754425783	-0.4151	-2.5725	17.860199999999999	0.22639999999999999	3.32e+11	3.32e+11	18653471415	6.5614
600000.SH	浦发银行	2016_1_5	15.7205	15.4644	15.9501	15.3672	15.8618	58054793	1034181474	0.1413	0.8989	17.8139	0.31119999999999998	3.35e+11	3.35e+11	18653471415	6.6204
600000.SH	浦发银行	2016_1_6	15.8618	15.8088	16.0208	15.6234	15.9855	46772653	838667398	0.1236	0.7795	17.930700000000002	0.25069999999999998	3.38e+11	3.38e+11	18653471415	6.6720

#df <- read.csv('./data_R/51-80数据.csv')

52（查看数据）：查看数据框的前3行

#head(df,3)
df %>% head(3)

A tibble: 3 × 18
代码	简称	日期	前收盘价(元)	开盘价(元)	最高价(元)	最低价(元)	收盘价(元)	成交量(股)	成交金额(元)	涨跌(元)	涨跌幅(%)	均价(元)	换手率(%)	A股流通市值(元)	总市值(元)	A股流通股本(股)	市盈率
<chr>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>
600000.SH	浦发银行	2016_1_4	16.1356	16.1444	16.1444	15.4997	15.7205	42240610	754425783	-0.4151	-2.5725	17.860199999999999	0.22639999999999999	3.32e+11	3.32e+11	18653471415	6.5614
600000.SH	浦发银行	2016_1_5	15.7205	15.4644	15.9501	15.3672	15.8618	58054793	1034181474	0.1413	0.8989	17.8139	0.31119999999999998	3.35e+11	3.35e+11	18653471415	6.6204
600000.SH	浦发银行	2016_1_6	15.8618	15.8088	16.0208	15.6234	15.9855	46772653	838667398	0.1236	0.7795	17.930700000000002	0.25069999999999998	3.38e+11	3.38e+11	18653471415	6.6720

53（查看缺失值）：查看每列数据缺失值情况

naniar包探索缺失值的数据结构

library(naniar)

#colSums(is.na(df))

df %>% miss_var_summary()

A tibble: 18 × 3
variable	n_miss	pct_miss
<chr>	<int>	<dbl>
代码	0	0
简称	0	0
日期	0	0
前收盘价(元)	0	0
开盘价(元)	0	0
最高价(元)	0	0
最低价(元)	0	0
收盘价(元)	0	0
成交量(股)	0	0
成交金额(元)	0	0
涨跌(元)	0	0
涨跌幅(%)	0	0
均价(元)	0	0
换手率(%)	0	0
A股流通市值(元)	0	0
总市值(元)	0	0
A股流通股本(股)	0	0
市盈率	0	0

54（查看缺失值）：查看日期列含有缺失值的行

df %>% 
  filter(is.na(日期))

which(is.na(df$日期))   # 列缺失的行号

A tibble: 0 × 18
代码	简称	日期	前收盘价(元)	开盘价(元)	最高价(元)	最低价(元)	收盘价(元)	成交量(股)	成交金额(元)	涨跌(元)	涨跌幅(%)	均价(元)	换手率(%)	A股流通市值(元)	总市值(元)	A股流通股本(股)	市盈率
<chr>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>

55（查看缺失值）：查看每列缺失值在哪些行

1

naIdx = df %>%
  where_na()         # 返回NA的行列索引, 需要naniar包
 
split(naIdx[,1], naIdx[,2])

2

library(glue)
 
for (i in names(df)){
  if(sum(is.na(df[,'日期'])) != 0){
    res1 <- which(is.na(df[,i]))
    res2 <- paste(res1,collapse = ',')
    print(glue('列名："{i}", 第[{res2}]行有缺失值'))
  }
}

载入程辑包：'glue'

The following object is masked from ‘package:dplyr’:

    collapse

56（缺失值处理）：删除所有存在缺失值的行

注：若要删除某些列包含缺失值的行，提供列名即可

#df %>% drop_na()

57（数据可视化）：绘制收盘价的折线图

library(ggplot2)

png('./data_R/out/57_折线图.png')
df %>% ggplot(aes(日期, `收盘价(元)`,group = 1)) + geom_line()     #必须加上group = 1
dev.off()

png: 2

58（数据可视化）：同时绘制开盘价与收盘价

png('./data_R/out/58_折线图.png')
df %>%
  select(日期, `开盘价(元)`, `收盘价(元)`) %>% 
  pivot_longer(-日期, 
               names_to = "type",
               values_to = "price") %>%
  ggplot(aes(日期, price, color = type, group = 1)) +    #必须加上group = 1，不然图片出不来
  geom_line() 

dev.off()

png: 2

59（数据可视化）：绘制涨跌幅的直方图

png('./data_R/out/59_涨跌幅直方图.png')

df %>% ggplot(aes(`涨跌幅(%)`)) + geom_histogram()
# 可以指定bins
dev.off()

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

png: 2

60（数据可视化）：让直方图更细致

png('./data_R/out/60_涨跌幅直方图.png')

df %>% 
  ggplot(aes(`涨跌幅(%)`)) +
  geom_histogram(bins = 40)

dev.off()

png: 2

P61-70

61（数据创建）：用df的列名创建数据框

as_tibble(names(df))

A tibble: 18 × 1
value
<chr>
代码
简称
日期
前收盘价(元)
开盘价(元)
最高价(元)
最低价(元)
收盘价(元)
成交量(股)
成交金额(元)
涨跌(元)
涨跌幅(%)
均价(元)
换手率(%)
A股流通市值(元)
总市值(元)
A股流通股本(股)
市盈率

62（异常值处理）：输出所有换手率不是数字的行

#换手率这一列属性为chr，需要先强转数值型
#如果转换失败会变成NA，判断即可
df[is.na(as.numeric(df$`换手率(%)`)),]

Warning message in `[.tbl_df`(df, is.na(as.numeric(df$`换手率(%)`)), ):
"强制改变过程中产生了NA"

A tibble: 18 × 18
代码	简称	日期	前收盘价(元)	开盘价(元)	最高价(元)	最低价(元)	收盘价(元)	成交量(股)	成交金额(元)	涨跌(元)	涨跌幅(%)	均价(元)	换手率(%)	A股流通市值(元)	总市值(元)	A股流通股本(股)	市盈率
<chr>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>
600000.SH	浦发银行	2016_2_16	16.2946	16.2946	16.2946	16.2946	16.2946	NA	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_17	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_18	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_19	16.2946	16.2946	16.2946	16.2946	16.2946	--	NA	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_22	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_23	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_24	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_25	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	NA	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_26	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_29	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_1	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_2	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_3	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_4	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_7	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	NA	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_8	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_9	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_10	16.2946	16.2946	16.2946	16.2946	16.2946	NA	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801

head(
    df %>% 
          mutate(`换手率(%)` = parse_number(`换手率(%)`)) %>% 
          filter(is.na(`换手率(%)`))
    )

Warning message:
"17 parsing failures.
row col expected actual
 27  -- a number     --
 28  -- a number     --
 29  -- a number     --
 30  -- a number     --
 31  -- a number     --
... ... ........ ......
See problems(...) for more details.
"

A tibble: 6 × 18
代码	简称	日期	前收盘价(元)	开盘价(元)	最高价(元)	最低价(元)	收盘价(元)	成交量(股)	成交金额(元)	涨跌(元)	涨跌幅(%)	均价(元)	换手率(%)	A股流通市值(元)	总市值(元)	A股流通股本(股)	市盈率
<chr>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
600000.SH	浦发银行	2016_2_16	16.2946	16.2946	16.2946	16.2946	16.2946	NA	--	0	0	--	NA	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_17	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	NA	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_18	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	NA	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_19	16.2946	16.2946	16.2946	16.2946	16.2946	--	NA	0	0	--	NA	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_22	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	NA	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_23	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	NA	3.44e+11	3.44e+11	18653471415	6.801

63（异常值处理）：输出所有换手率为–的行

df %>% 
  filter(`换手率(%)` == "--")

A tibble: 17 × 18
代码	简称	日期	前收盘价(元)	开盘价(元)	最高价(元)	最低价(元)	收盘价(元)	成交量(股)	成交金额(元)	涨跌(元)	涨跌幅(%)	均价(元)	换手率(%)	A股流通市值(元)	总市值(元)	A股流通股本(股)	市盈率
<chr>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>
600000.SH	浦发银行	2016_2_16	16.2946	16.2946	16.2946	16.2946	16.2946	NA	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_17	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_18	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_19	16.2946	16.2946	16.2946	16.2946	16.2946	--	NA	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_22	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_23	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_24	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_25	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	NA	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_26	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_2_29	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_1	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_2	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_3	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_4	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_8	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_9	16.2946	16.2946	16.2946	16.2946	16.2946	--	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801
600000.SH	浦发银行	2016_3_10	16.2946	16.2946	16.2946	16.2946	16.2946	NA	--	0	0	--	--	3.44e+11	3.44e+11	18653471415	6.801

64（数据操作）：重置df的行号

rownames(df) = NULL    # R中无行号就是数字索引    
# 如果是tibble则索引始终是按顺序

65（异常值处理）：删除所有换手率为非数字的行

df <- df %>% 
  mutate(`换手率(%)` = parse_number(`换手率(%)`)) %>% 
  filter(!is.na(`换手率(%)`))

Warning message:
"17 parsing failures.
row col expected actual
 27  -- a number     --
 28  -- a number     --
 29  -- a number     --
 30  -- a number     --
 31  -- a number     --
... ... ........ ......
See problems(...) for more details.
"

#df[!is.na(as.numeric(df$`换手率(%)`)),]
# 或者根据前几题的经验，非数字就是'--'
df <- df %>%
  filter(`换手率(%)` != '--')

为了便于后续处理，做批量数值型转化, 并转化为tsibble对象

# library(tsibble)
 
# df <- df %>% 
#   mutate_at(vars(4:18), as.numeric) %>% 
#   mutate(日期 = lubridate::as_date(日期)) %>%
#   as_tsibble(index = 日期, key = c(代码, 简称))

66（数据可视化）：绘制换手率的密度曲线

png('./data_R/out/66_换手率的密度曲线.png')

df$`换手率(%)` <- as.double(df$`换手率(%)`)
ggplot(df) +
  geom_density(aes(`换手率(%)`))

dev.off()

png: 2

67（数据计算）：计算前一天与后一天收盘价的差值

summarise和mutate函数都可以对一个数据框的某一列(而不是整个数据框)进行修改和汇总，两者的主要区别在于返回结果的方式不同，
其中summarise函数返回一个只包含修改或汇总后数据的数据框，
而mutate函数则返回一个由原始数据和修改或汇总后数据两部分构成的数据框

delta <- df %>%
  summarise(delta = `收盘价(元)` - lag(`收盘价(元)`))
head(delta)

A tibble: 6 × 1
delta
<dbl>
NA
0.1413
0.1237
-0.5211
-0.0177
-0.3886

delta <- df %>% 
  mutate(delta = `收盘价(元)` - lag(`收盘价(元)`)) %>% 
  select(日期, `收盘价(元)`, delta)
head(delta)

A tibble: 6 × 3
日期	收盘价(元)	delta
<chr>	<dbl>	<dbl>
2016_1_4	15.7205	NA
2016_1_5	15.8618	0.1413
2016_1_6	15.9855	0.1237
2016_1_7	15.4644	-0.5211
2016_1_8	15.4467	-0.0177
2016_1_11	15.0581	-0.3886

68（数据计算）：计算前一天与后一天收盘价的变化率

lead()/lag()函数
通过这两个函数可以在一次查询中取出同一字段的前N行的数据(lag)和后N行的数据(lead)作为独立的列

pct_change <- df %>% summarise(pct_change = (`收盘价(元)` - lag(`收盘价(元)`))/lag(`收盘价(元)`))
head(pct_change)

A tibble: 6 × 1
pct_change
<dbl>
NA
0.008988264
0.007798610
-0.032598292
-0.001144564
-0.025157477

pct_change <- df %>% 
  mutate(change = (`收盘价(元)` - lag(`收盘价(元)`)) / `收盘价(元)`) %>% 
  select(日期, `收盘价(元)`, change)
head(pct_change)

A tibble: 6 × 3
日期	收盘价(元)	change
<chr>	<dbl>	<dbl>
2016_1_4	15.7205	NA
2016_1_5	15.8618	0.008908195
2016_1_6	15.9855	0.007738263
2016_1_7	15.4644	-0.033696749
2016_1_8	15.4467	-0.001145876
2016_1_11	15.0581	-0.025806709

69（数据操作）：设置日期为行索引

head(
    df %>% column_to_rownames("日期")
)

A data.frame: 6 × 17
	代码	简称	前收盘价(元)	开盘价(元)	最高价(元)	最低价(元)	收盘价(元)	成交量(股)	成交金额(元)	涨跌(元)	涨跌幅(%)	均价(元)	换手率(%)	A股流通市值(元)	总市值(元)	A股流通股本(股)	市盈率
	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
2016_1_4	600000.SH	浦发银行	16.1356	16.1444	16.1444	15.4997	15.7205	42240610	754425783	-0.4151	-2.5725	17.860199999999999	0.2264	3.32e+11	3.32e+11	18653471415	6.5614
2016_1_5	600000.SH	浦发银行	15.7205	15.4644	15.9501	15.3672	15.8618	58054793	1034181474	0.1413	0.8989	17.8139	0.3112	3.35e+11	3.35e+11	18653471415	6.6204
2016_1_6	600000.SH	浦发银行	15.8618	15.8088	16.0208	15.6234	15.9855	46772653	838667398	0.1236	0.7795	17.930700000000002	0.2507	3.38e+11	3.38e+11	18653471415	6.6720
2016_1_7	600000.SH	浦发银行	15.9855	15.7205	15.8088	15.3672	15.4644	11350479	199502702	-0.5211	-3.2597	17.576599999999999	0.0608	3.27e+11	3.27e+11	18653471415	6.4545
2016_1_8	600000.SH	浦发银行	15.4644	15.6675	15.7912	14.9345	15.4467	71918296	1262105060	-0.0177	-0.1142	17.549199999999999	0.3855	3.26e+11	3.26e+11	18653471415	6.4471
2016_1_11	600000.SH	浦发银行	15.4467	15.1994	15.4114	14.9786	15.0581	90177135	1550155933	-0.3886	-2.5157	17.190100000000001	0.4834	3.18e+11	3.18e+11	18653471415	6.2849

70（数据计算）：对收盘价做步长为5的滑动平均

library(slider)
head(
        df %>%
          mutate(avg_5 = slide_dbl(`收盘价(元)`, mean, na.rm = TRUE, 
                                   .before = 2, .after = 2)) %>% 
          select(日期, `收盘价(元)`, avg_5)
)

A tibble: 6 × 3
日期	收盘价(元)	avg_5
<chr>	<dbl>	<dbl>
2016_1_4	15.7205	15.85593
2016_1_5	15.8618	15.75805
2016_1_6	15.9855	15.69578
2016_1_7	15.4644	15.56330
2016_1_8	15.4467	15.47322
2016_1_11	15.0581	15.34780

以5个数据作为一个数据滑动窗口，在这个5个数据上取均值(收盘价)

# library(RcppRoll)
# df %>% transmute(avg_5 = roll_mean(`收盘价(元)`,n = 5,align="right",fill = NA))

P71-80

71（数据计算）：对收盘价做步长为5的滑动求和

head(
        df %>%
          mutate(sum_5 = slide_dbl(`收盘价(元)`, sum, na.rm = TRUE, 
                                   .before = 2, .after = 2)) %>% 
          select(日期, `收盘价(元)`, sum_5)
)

A tibble: 6 × 3
日期	收盘价(元)	sum_5
<chr>	<dbl>	<dbl>
2016_1_4	15.7205	47.5678
2016_1_5	15.8618	63.0322
2016_1_6	15.9855	78.4789
2016_1_7	15.4644	77.8165
2016_1_8	15.4467	77.3661
2016_1_11	15.0581	76.7390

# df %>%
#   transmute(sum_5 = roll_sum(`收盘价(元)`,n = 5,align="right",fill = NA))

72（数据可视化）：将收盘价及其5日均线、20日均线绘制在同一个图上

#df$`收盘价(元)` <- as.numeric(df$`收盘价(元)`)
head(df)

A tibble: 6 × 18
代码	简称	日期	前收盘价(元)	开盘价(元)	最高价(元)	最低价(元)	收盘价(元)	成交量(股)	成交金额(元)	涨跌(元)	涨跌幅(%)	均价(元)	换手率(%)	A股流通市值(元)	总市值(元)	A股流通股本(股)	市盈率
<chr>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
600000.SH	浦发银行	2016_1_4	16.1356	16.1444	16.1444	15.4997	15.7205	42240610	754425783	-0.4151	-2.5725	17.860199999999999	0.2264	3.32e+11	3.32e+11	18653471415	6.5614
600000.SH	浦发银行	2016_1_5	15.7205	15.4644	15.9501	15.3672	15.8618	58054793	1034181474	0.1413	0.8989	17.8139	0.3112	3.35e+11	3.35e+11	18653471415	6.6204
600000.SH	浦发银行	2016_1_6	15.8618	15.8088	16.0208	15.6234	15.9855	46772653	838667398	0.1236	0.7795	17.930700000000002	0.2507	3.38e+11	3.38e+11	18653471415	6.6720
600000.SH	浦发银行	2016_1_7	15.9855	15.7205	15.8088	15.3672	15.4644	11350479	199502702	-0.5211	-3.2597	17.576599999999999	0.0608	3.27e+11	3.27e+11	18653471415	6.4545
600000.SH	浦发银行	2016_1_8	15.4644	15.6675	15.7912	14.9345	15.4467	71918296	1262105060	-0.0177	-0.1142	17.549199999999999	0.3855	3.26e+11	3.26e+11	18653471415	6.4471
600000.SH	浦发银行	2016_1_11	15.4467	15.1994	15.4114	14.9786	15.0581	90177135	1550155933	-0.3886	-2.5157	17.190100000000001	0.4834	3.18e+11	3.18e+11	18653471415	6.2849

png('./data_R/out/72_收盘价及其均线1.png')

df %>%
  mutate(avg_5 = slide_dbl(`收盘价(元)`, mean, na.rm = TRUE, .before = 2, .after = 2),
         avg_20 = slide_dbl(`收盘价(元)`, mean, na.rm = TRUE, .before = 10, .after = 9)) %>% 
  pivot_longer(c(`收盘价(元)`, avg_5, avg_20),
                  names_to = "type",
                  values_to = "price") %>% 
  ggplot(aes(日期, price, color = type,group = 1)) +
    geom_line()
dev.off()

png: 2

# png('./data_R/out/72_收盘价及其均线2.png')
# df %>%
#   mutate(avg_5 = roll_mean(`收盘价(元)`,n = 5,align="right",fill = NA),
#          avg_20 = roll_mean(`收盘价(元)`,n = 20,align="right",fill = NA)) %>%
#   ggplot() +
#   geom_line(aes(日期,`收盘价(元)`,group = 1),color = 'steelblue',size = 1.2) +
#   geom_line(aes(日期,avg_5,group = 1),color = 'orange',size = 1.2) +
#   geom_line(aes(日期,avg_20,group = 1),color = 'green',size = 1.2)

# dev.off()

73（数据重采样）：按周为采样规则，计算一周收盘价最大值

# weekmax = df %>% 
#   index_by(weeks = ~ yearweek(.)) %>%    #周度汇总
#   summarise(max_week = max(`收盘价(元)`, na.rm = TRUE))
# weekmax

# library(plyr)
 
# res <- dlply(df,.(cut(日期,"7")),"[")
# res_max <- sapply(res,function(n)max(n$`收盘价(元)`),simplify=TRUE)
# as.data.frame(res_max)

Error in cut.default(日期, "7"): 'x' 必需是数值
Traceback:


1. dlply(df, .(cut(日期, "7")), "[")

2. splitter_d(.data, .variables, drop = .drop)

3. eval.quoted(.variables, data)

4. lapply(exprs, eval, envir = envir, enclos = enclos)

5. FUN(X[[i]], ...)

6. FUN(X[[i]], ...)

7. cut(日期, "7")

8. cut.default(日期, "7")

9. stop("'x' must be numeric")

74（数据可视化）：绘制重采样数据与原始数据

# png('./data_R/out/74_重采样数据与原始数据.png')
# ggplot() +
#   geom_line(data = weekmax, aes(weeks, max_week), color = "red") +
#   geom_line(data = df, aes(日期, `收盘价(元)`), color = "steelblue")  
# dev.off()

75（数据操作）：将数据往后移动5天

这是批量做后移，单个变量做后移用mutate(var = lag(var, 5)即可。

head(
bind_cols(df[,1:3], map_dfc(df[,-(1:3)], lag, n = 5)), 10
)

A tibble: 10 × 18
代码	简称	日期	前收盘价(元)	开盘价(元)	最高价(元)	最低价(元)	收盘价(元)	成交量(股)	成交金额(元)	涨跌(元)	涨跌幅(%)	均价(元)	换手率(%)	A股流通市值(元)	总市值(元)	A股流通股本(股)	市盈率
<chr>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
600000.SH	浦发银行	2016_1_4	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
600000.SH	浦发银行	2016_1_5	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
600000.SH	浦发银行	2016_1_6	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
600000.SH	浦发银行	2016_1_7	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
600000.SH	浦发银行	2016_1_8	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
600000.SH	浦发银行	2016_1_11	16.1356	16.1444	16.1444	15.4997	15.7205	42240610	754425783	-0.4151	-2.5725	17.860199999999999	0.2264	3.32e+11	3.32e+11	18653471415	6.5614
600000.SH	浦发银行	2016_1_12	15.7205	15.4644	15.9501	15.3672	15.8618	58054793	1034181474	0.1413	0.8989	17.8139	0.3112	3.35e+11	3.35e+11	18653471415	6.6204
600000.SH	浦发银行	2016_1_13	15.8618	15.8088	16.0208	15.6234	15.9855	46772653	838667398	0.1236	0.7795	17.930700000000002	0.2507	3.38e+11	3.38e+11	18653471415	6.6720
600000.SH	浦发银行	2016_1_14	15.9855	15.7205	15.8088	15.3672	15.4644	11350479	199502702	-0.5211	-3.2597	17.576599999999999	0.0608	3.27e+11	3.27e+11	18653471415	6.4545
600000.SH	浦发银行	2016_1_15	15.4644	15.6675	15.7912	14.9345	15.4467	71918296	1262105060	-0.0177	-0.1142	17.549199999999999	0.3855	3.26e+11	3.26e+11	18653471415	6.4471

76（数据操作）：将数据往前移动5天

head(
bind_cols(df[,1:3], map_dfc(df[,-(1:3)], lead, n = 5)), 5
)

A tibble: 5 × 18
代码	简称	日期	前收盘价(元)	开盘价(元)	最高价(元)	最低价(元)	收盘价(元)	成交量(股)	成交金额(元)	涨跌(元)	涨跌幅(%)	均价(元)	换手率(%)	A股流通市值(元)	总市值(元)	A股流通股本(股)	市盈率
<chr>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
600000.SH	浦发银行	2016_1_4	15.4467	15.1994	15.4114	14.9786	15.0581	90177135	1550155933	-0.3886	-2.5157	17.190100000000001	0.4834	3.18e+11	3.18e+11	18653471415	6.2849
600000.SH	浦发银行	2016_1_5	15.0581	15.1641	15.4732	15.0846	15.4114	55374454	964061502	0.3533	2.3460	17.4099	0.2969	3.26e+11	3.26e+11	18653471415	6.4324
600000.SH	浦发银行	2016_1_6	15.4114	15.5174	15.8088	15.3231	15.3584	47869312	843717365	-0.0530	-0.3438	17.625399999999999	0.2566	3.24e+11	3.24e+11	18653471415	6.4102
600000.SH	浦发银行	2016_1_7	15.3584	15.0140	15.8883	14.9168	15.8265	54838833	966117848	0.4681	3.0477	17.6174	0.2940	3.34e+11	3.34e+11	18653471415	6.6056
600000.SH	浦发银行	2016_1_8	15.8265	15.7205	16.0296	15.4732	15.5262	46723139	836146426	-0.3003	-1.8973	17.895800000000001	0.2505	3.28e+11	3.28e+11	18653471415	6.4803

77（数据操作）：计算开盘价的累积平均

rlt = df %>% 
  mutate(累积平均 = cummean(`开盘价(元)`)) %>% 
  select(日期, `开盘价(元)`, 累积平均)
head(rlt)

A tibble: 6 × 3
日期	开盘价(元)	累积平均
<chr>	<dbl>	<dbl>
2016_1_4	16.1444	16.14440
2016_1_5	15.4644	15.80440
2016_1_6	15.8088	15.80587
2016_1_7	15.7205	15.78453
2016_1_8	15.6675	15.76112
2016_1_11	15.1994	15.66750

78（数据计算）：绘制开盘价的累积平均与原始数据的折线图

height, width 和 res 分别为绘图框高、宽和图的分辨率

png('./data_R/out/78_开盘价的累积平均与原始数据的折线图1.png', height=480,width=700)
rlt %>% 
  pivot_longer(-日期, names_to = "type", values_to = "price") %>% 
  ggplot(aes(日期, price, color = type, group = 1)) +
    geom_line()

dev.off()

png: 2

png('./data_R/out/78_开盘价的累积平均与原始数据的折线图2.png', height=480,width=700)
res <- df %>%
  transmute(cummean = cumsum(`开盘价(元)`)/1:dim(df)[1])

df %>%
  cbind(res) %>%
  dplyr::rename(Opening_Price = `开盘价(元)`,
         Expanding_Open_Mean = cummean) %>%
  select(日期,Opening_Price,Expanding_Open_Mean) %>%
  pivot_longer(c(Opening_Price,Expanding_Open_Mean),
               names_to = 'type',
               values_to ='price') %>%
  ggplot(aes(日期,price,color = type, group=1)) +
  geom_line(size=1.2) +
  scale_color_manual(values=c('orange','steelblue')) +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    legend.position = c(0.9, 0.9)
  )

dev.off()

png: 2

79（数据计算）：计算布林指标

boll = df %>%
  mutate(avg_20 = slide_dbl(`收盘价(元)`, mean, na.rm = TRUE, .before = 10, .after = 9), 
         sd_20 = slide_dbl(`收盘价(元)`, sd, na.rm = TRUE, .before = 10, .after = 9),
         up = avg_20 + 2 * sd_20,
         down = avg_20 - 2 * sd_20) %>% 
  select(日期, `收盘价(元)`, avg_20, up, down)

 boll %>% sample_n(10)

A tibble: 10 × 5
日期	收盘价(元)	avg_20	up	down
<chr>	<dbl>	<dbl>	<dbl>	<dbl>
2017_2_23	16.6900	16.65150	17.00604	16.29696
2016_5_16	15.2701	15.54476	16.01340	15.07612
2017_2_17	16.6400	16.70400	16.94526	16.46274
2016_5_26	15.6322	15.64324	16.22392	15.06256
2016_8_25	16.3400	16.44150	16.79125	16.09175
2017_4_14	15.5100	15.56500	16.44007	14.68993
2017_4_11	15.8900	15.70000	16.52027	14.87973
2016_5_30	15.8530	15.68387	16.21329	15.15445
2017_1_26	16.7400	16.61150	16.97760	16.24540
2016_6_1	16.0650	15.72803	16.16838	15.28767

80（数据可视化）：绘制布林曲线

png('./data_R/out/80_布林曲线.png', height=480,width=2000)
boll %>% 
  pivot_longer(-日期, names_to = "type", values_to = "price") %>% 
  ggplot(aes(日期, price, color = type,group=1)) + 
    geom_line()
dev.off()

png: 2

P81-90

81（加载查看包）：加载并查看tidyverse包版本

packageVersion("tidyverse")

[1] '1.3.1'

packageVersion("dplyr")

[1] '1.0.8'

82（生成随机数）：生成20个0~100的随机数，创建数据框

sample.int() 随机抽样函数

set.seed(123)                # 保证结果出现
df1 = tibble(nums = sample.int(100, 20))
df1

A tibble: 20 × 1
nums
<int>
31
79
51
14
67
42
50
43
97
25
90
69
57
9
72
26
7
95
87
36

83（生成等差数）：生成20个0~100固定步长的数，创建数据框

seq()函数是产生等距间隔数列的函数
seq(frme,to,by)
by是步长

df2 = tibble(nums = seq(0, 99, by = 5))
df2

A tibble: 20 × 1
nums
<dbl>
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95

84（生成指定分布随机数）：生成20个标准正态分布的随机数，创建数据框

R语言中rnorm函数 rnorm(n, mean = 0, sd = 1) n 为产生随机值个数(长度),mean 是平均数, sd 是标准差

set.seed(111)    #set.seed()函数是为了保证你随机生成的随机数前后一致
df3 = tibble(nums = rnorm(20, 0, 1))
df3

A tibble: 20 × 1
nums
<dbl>
0.23522071
-0.33073587
-0.31162382
-2.30234566
-0.17087604
0.14027823
-1.49742666
-1.01018842
-0.94847560
-0.49396222
-0.17367413
-0.40659878
1.84563626
0.39405411
0.79752850
-1.56666536
-0.08585101
-0.35913948
-1.19360897
0.36418674

85（合并数据）：将df1, df2, df3按行合并为新数据框

#bind_rows(df1, df2, df3)
rbind(df1, df2, df3)

86（合并数据）：将df1, df2, df3按列合并为新数据框

df = cbind(df1, df2, df3)
names(df) <- c(0,1,2)
head(df)

A data.frame: 6 × 3
	0	1	2
	<int>	<dbl>	<dbl>
1	31	0	0.2352207
2	79	5	-0.3307359
3	51	10	-0.3116238
4	14	15	-2.3023457
5	67	20	-0.1708760
6	42	25	0.1402782

87（查看数据）：查看df所有数据的最小值、25%分位数、中位数、75%分位数、最大值

summary(unlist(df))
# unlist(df) %>% summary()

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-2.3024  0.1052 25.5000 33.1654 61.2500 97.0000

88（修改列名）：修改列名为col1, col2, col3

若只修改个别列名，用rename(newname=oldname).

names(df) <- c('col1','col2','col3')
head(df)

A data.frame: 6 × 3
	col1	col2	col3
	<int>	<dbl>	<dbl>
1	31	0	0.2352207
2	79	5	-0.3307359
3	51	10	-0.3116238
4	14	15	-2.3023457
5	67	20	-0.1708760
6	42	25	0.1402782

89（数据操作）：提取在第1列中而不在第2列中的数

setdiff(df$col1, df$col2)

90（数据操作）：提取在第1列和第2列出现频率最高的三个数字

count(unlist(c(df$col1,df$col2))) %>%
  arrange(desc(freq)) %>%
  filter(row_number() <= 3)

A data.frame: 3 × 2
x	freq
<dbl>	<int>
25	2
50	2
90	2

P91-100

91（数据操作）：提取第1列可以整除5的数的位置

which(df$col1 %% 5 == 0)

92（数据计算）：计算第1列的1阶差分

df %>% mutate(diff1 = col1 - lag(col1))   #若只是要数值，用diff(df$col1)即可

A data.frame: 20 × 4
col1	col2	col3	diff1
<int>	<dbl>	<dbl>	<int>
31	0	0.23522071	NA
79	5	-0.33073587	48
51	10	-0.31162382	-28
14	15	-2.30234566	-37
67	20	-0.17087604	53
42	25	0.14027823	-25
50	30	-1.49742666	8
43	35	-1.01018842	-7
97	40	-0.94847560	54
25	45	-0.49396222	-72
90	50	-0.17367413	65
69	55	-0.40659878	-21
57	60	1.84563626	-12
9	65	0.39405411	-48
72	70	0.79752850	63
26	75	-1.56666536	-46
7	80	-0.08585101	-19
95	85	-0.35913948	88
87	90	-1.19360897	-8
36	95	0.36418674	-51

93（数据操作）：将col1, col2, col3三列顺序颠倒

head( 
    df %>% select(rev(names(df)))
)

A data.frame: 6 × 3
	col3	col2	col1
	<dbl>	<dbl>	<int>
1	0.2352207	0	31
2	-0.3307359	5	79
3	-0.3116238	10	51
4	-2.3023457	15	14
5	-0.1708760	20	67
6	0.1402782	25	42

head(
df %>% select(col3,col2,everything())
)

A data.frame: 6 × 3
	col3	col2	col1
	<dbl>	<dbl>	<int>
1	0.2352207	0	31
2	-0.3307359	5	79
3	-0.3116238	10	51
4	-2.3023457	15	14
5	-0.1708760	20	67
6	0.1402782	25	42

94（数据操作）：提取第一列位置在1,10,15的数

df[c(1,10,15),1]

95（数据操作）：查找第一列的局部最大值位置

rlt = df %>% 
  mutate(diff = sign(col1 - lag(col1)) + sign(col1 - lead(col1))) 
 
which(rlt$diff == 2)

96（数据计算）：按行计算df每一行的均值

rowMeans() 函数

rowMeans(df)    # 或者 apply(df, 1, mean)

10.4117402372046
27.8897547094578
20.2294587253404
8.89921811385688
28.943041318462
22.3800927417011
26.167524448145
25.6632705269829
45.3505081316825
23.1686792609216
46.6087752906765
41.1978004066531
39.6152120879357
24.7980180366582
47.5991761670872
33.1444448799381
28.9713829970591
59.8802868395874
58.6021303444788
43.7880622456741

97（数据计算）：对第二列计算步长为3的移动平均值

library(RcppRoll)
 
df %>%
  summarise(avg_3 = roll_mean(col2, n=3))

A data.frame: 18 × 1
avg_3
<dbl>
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90

98（数据计算）：按第三列值的大小升序排列

df %>% arrange(col3)

A data.frame: 20 × 3
col1	col2	col3
<int>	<dbl>	<dbl>
14	15	-2.30234566
26	75	-1.56666536
50	30	-1.49742666
87	90	-1.19360897
43	35	-1.01018842
97	40	-0.94847560
25	45	-0.49396222
69	55	-0.40659878
95	85	-0.35913948
79	5	-0.33073587
51	10	-0.31162382
90	50	-0.17367413
67	20	-0.17087604
7	80	-0.08585101
42	25	0.14027823
31	0	0.23522071
36	95	0.36418674
9	65	0.39405411
72	70	0.79752850
57	60	1.84563626

99（数据操作）：按第一列大于50的数修改为"高"

df[df$col1 > 50,1] <- '高'

df

A data.frame: 20 × 3
col1	col2	col3
<chr>	<dbl>	<dbl>
31	0	0.23522071
高	5	-0.33073587
高	10	-0.31162382
14	15	-2.30234566
高	20	-0.17087604
42	25	0.14027823
50	30	-1.49742666
43	35	-1.01018842
高	40	-0.94847560
25	45	-0.49396222
高	50	-0.17367413
高	55	-0.40659878
高	60	1.84563626
9	65	0.39405411
高	70	0.79752850
26	75	-1.56666536
7	80	-0.08585101
高	85	-0.35913948
高	90	-1.19360897
36	95	0.36418674

100（数据计算）：计算第一列与第二列的欧氏距离

R语言中dist()与daisy()函数常用用来计算x矩阵行向量之间的距离

dist(t(df[,1:2]))
# dist(rbind(df$col1,df$col2))

Warning message in dist(t(df[, 1:2])):
"强制改变过程中产生了NA"



         col1
col2 181.4497

P101-110

101（数据读取）：从csv文件中读取指定数据：读取前10行， positionName和salary列

注1：该数据是GBK编码，为避免中文乱码，GBK编码的csv或txt用read.csv()读取；UTF-8编码的csv或txt用readr::read_csv()读取；若用read_csv()读取GBK编码文件，需要设置编码（见题目110）。

注2：R中常规读取数据不能在读取时选择列，采用读取之后选择列

read.csv("./data_R/数据1_101-120涉及.csv", nrows = 10) %>% 
  select(positionName, salary)

A data.frame: 10 × 2
positionName	salary
<chr>	<int>
数据分析	37500
数据建模	15000
数据分析	3500
数据分析	45000
数据分析	30000
数据分析	50000
数据分析	30000
数据建模工程师	35000
数据分析专家	60000
数据分析师	40000

102（数据读取）：从csv文件中读取数据，将薪资大于10000的改为"高"

df = read.csv("./data_R/数据2_101-120涉及.csv") %>% 
  mutate(薪资水平 = if_else(薪资水平 > 10000, "高", "低"))
head(df)

# library(readr)
 
# df2 <- read.csv('./data_R/数据2_101-120涉及.csv') %>%
#   mutate('学历要求',
#          '薪资水平' = ifelse(
#            薪资水平 > 10000,'高','低'))

A data.frame: 6 × 2
	学历要求	薪资水平
	<chr>	<chr>
1	本科	高
2	硕士	高
3	本科	低
4	本科	高
5	不限	高
6	硕士	高

103（数据操作）：从df中对薪资水平每隔20行进行抽样

df[seq(1,dim(df)[1],20),]

df %>% 
  slice(seq(1, n(), by = 20))  
# 或者用df[seq(1, nrow(df), 20),]

104（数据操作）：取消使用科学记数法

set.seed(123)
df = tibble(data = runif(10) ^ 10) %>% 
  round(3)
df

A tibble: 10 × 1
data
<dbl>
0.000
0.093
0.000
0.288
0.541
0.000
0.002
0.320
0.003
0.000

105（数据操作）：将上一题的数据转换为百分数

mutate()函数增加新列并保留已存在的列

scales::percent（）将数字类型转换为百分数

df %>% 
  mutate(data = scales::percent(data, 0.01))

A tibble: 10 × 1
data
<chr>
0.00%
9.30%
0.00%
28.80%
54.10%
0.00%
0.20%
32.00%
0.30%
0.00%

106（数据操作）：查找上一题数据中第3大值的行号

sort(x) 是对向量x进行排序，返回值排序后的数值向量。
rank() 是求秩的函数，它的返回值是这个向量中对应元素的“排名”。
而order() 的返回值是指出排序后的向量中各元素在原向量中的索引，默认升序。

order(df$data, decreasing = TRUE)[3]

107（数据操作）：反转df的行

df %>% 
  slice(rev(1:n()))
# 或者 df[rev(1:nrow(df)),]

A tibble: 10 × 1
data
<dbl>
0.000
0.003
0.320
0.002
0.000
0.541
0.288
0.000
0.093
0.000

108（数据连接：全连接）：根据多列匹配合并数据，保留df1和df2的观测

df1 <- tibble(
  key1 = c("K0","K0","K1","K2"),
  key2 = c("K0","K1","K0","K1"),
  A = str_c('A', 0:3),
  B = str_c('B', 0:3)
)
 
df2 <- tibble(
  key1 = c("K0","K1","K1","K2"),
  key2 = str_c("K", rep(0,4)),
  C = str_c('C', 0:3),
  D = str_c('D', 0:3)
)

df1

A tibble: 4 × 4
key1	key2	A	B
<chr>	<chr>	<chr>	<chr>
K0	K0	A0	B0
K0	K1	A1	B1
K1	K0	A2	B2
K2	K1	A3	B3

df2

A tibble: 4 × 4
key1	key2	C	D
<chr>	<chr>	<chr>	<chr>
K0	K0	C0	D0
K1	K0	C1	D1
K1	K0	C2	D2
K2	K0	C3	D3

df1 %>% 
  full_join(df2, by = c("key1", "key2"))

A tibble: 6 × 6
key1	key2	A	B	C	D
<chr>	<chr>	<chr>	<chr>	<chr>	<chr>
K0	K0	A0	B0	C0	D0
K0	K1	A1	B1	NA	NA
K1	K0	A2	B2	C1	D1
K1	K0	A2	B2	C2	D2
K2	K1	A3	B3	NA	NA
K2	K0	NA	NA	C3	D3

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-UcoCsdKg-1653619964755)(attachment:image.png)]

109（数据连接：左连接）：根据多列匹配合并数据，只保留df1的观测

df1 %>% 
  left_join(df2, by = c("key1", "key2"))

A tibble: 5 × 6
key1	key2	A	B	C	D
<chr>	<chr>	<chr>	<chr>	<chr>	<chr>
K0	K0	A0	B0	C0	D0
K0	K1	A1	B1	NA	NA
K1	K0	A2	B2	C1	D1
K1	K0	A2	B2	C2	D2
K2	K1	A3	B3	NA	NA

dplyr包还提供了右连接right_join()，内连接：inner_join()，以及用于过滤的连接：半连接：semi_join()，反连接：anti_join().

110（数据处理）：再次读取数据1并显示所有列

str函数和glimpse都可以方便我们查看数据框内的变量细节

df <- read.csv("./data_R/数据1_101-120涉及.csv") %>%
      glimpse()

Rows: 105
Columns: 53
$ positionId            [3m[90m<int>[39m[23m 6802721, 5204912, 6877668, 6496141, 6467417, 688~
$ positionName          [3m[90m<chr>[39m[23m "数据分析", "数据建模", "数据分析", "数据分析", ~
$ companyId             [3m[90m<int>[39m[23m 475770, 50735, 100125, 26564, 29211, 94826, 3487~
$ companyLogo           [3m[90m<chr>[39m[23m "i/image2/M01/B7/3E/CgoB5lwPfEaAdn8WAABWQ0Jgl5s3~
$ companySize           [3m[90m<chr>[39m[23m "50-150人", "150-500人", "2000人以上", "500-2000~
$ industryField         [3m[90m<chr>[39m[23m "移动互联网,电商", "电商", "移动互联网,企业服务"~
$ financeStage          [3m[90m<chr>[39m[23m "A轮", "B轮", "上市公司", "D轮及以上", "上市公司~
$ companyLabelList      [3m[90m<chr>[39m[23m "['绩效奖金', '带薪年假', '定期体检', '弹性工作'~
$ firstType             [3m[90m<chr>[39m[23m "产品|需求|项目类", "开发|测试|运维类", "产品|需~
$ secondType            [3m[90m<chr>[39m[23m "数据分析", "数据开发", "数据分析", "数据开发", ~
$ thirdType             [3m[90m<chr>[39m[23m "数据分析", "建模", "数据分析", "数据分析", "数~
$ skillLables           [3m[90m<chr>[39m[23m "['SQL', '数据库', '数据运营', 'BI']", "['算法',~
$ positionLables        [3m[90m<chr>[39m[23m "['电商', '社交', 'SQL', '数据库', '数据运营', '~
$ industryLables        [3m[90m<chr>[39m[23m "['电商', '社交', 'SQL', '数据库', '数据运营', '~
$ createTime            [3m[90m<chr>[39m[23m "2020/3/16 11:00", "2020/3/16 11:08", "2020/3/16~
$ formatCreateTime      [3m[90m<chr>[39m[23m "11:00发布", "11:08发布", "10:33发布", "10:10发~
$ district              [3m[90m<chr>[39m[23m "余杭区", "滨江区", "江干区", "江干区", "余杭区"~
$ businessZones         [3m[90m<chr>[39m[23m "['仓前']", "['西兴', '长河']", "['四季青', '钱~
$ salary                [3m[90m<int>[39m[23m 37500, 15000, 3500, 45000, 30000, 50000, 30000, ~
$ workYear              [3m[90m<chr>[39m[23m "1-3年", "3-5年", "1-3年", "3-5年", "3-5年", "1-~
$ jobNature             [3m[90m<chr>[39m[23m "全职", "全职", "全职", "全职", "全职", "全职", ~
$ education             [3m[90m<chr>[39m[23m "本科", "本科", "本科", "本科", "大专", "本科", ~
$ positionAdvantage     [3m[90m<chr>[39m[23m "五险一金、弹性工作、带薪年假、年度体检", "六险~
$ imState               [3m[90m<chr>[39m[23m "today", "disabled", "today", "threeDays", "disa~
$ lastLogin             [3m[90m<chr>[39m[23m "2020/3/16 11:00", "2020/3/16 11:08", "2020/3/16~
$ publisherId           [3m[90m<int>[39m[23m 12022406, 5491688, 5322583, 9814560, 6392394, 11~
$ approve               [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
$ subwayline            [3m[90m<chr>[39m[23m "", "", "4号线", "1号线", "", "", "", "2号线", "~
$ stationname           [3m[90m<chr>[39m[23m "", "", "江锦路", "文泽路", "", "", "", "丰潭路"~
$ linestaion            [3m[90m<chr>[39m[23m "", "", "4号线_城星路;4号线_市民中心;4号线_江锦~
$ latitude              [3m[90m<dbl>[39m[23m 30.27842, 30.18804, 30.24152, 30.29940, 30.28295~
$ longitude             [3m[90m<dbl>[39m[23m 120.0059, 120.2012, 120.2125, 120.3503, 120.0098~
$ hitags                [3m[90m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", ~
$ resumeProcessRate     [3m[90m<int>[39m[23m 50, 23, 11, 100, 20, 16, 100, 1, 83, 1, 83, 0, 1~
$ resumeProcessDay      [3m[90m<int>[39m[23m 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, ~
$ score                 [3m[90m<int>[39m[23m 233, 176, 80, 68, 66, 66, 65, 47, 24, 18, 17, 17~
$ newScore              [3m[90m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ matchScore            [3m[90m<dbl>[39m[23m 15.1018750, 32.5594140, 14.9723570, 12.8741530, ~
$ matchScoreExplain     [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ query                 [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ explain               [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ isSchoolJob           [3m[90m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ~
$ adWord                [3m[90m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ plus                  [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ pcShow                [3m[90m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ appShow               [3m[90m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ deliver               [3m[90m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ gradeDescription      [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ promotionScoreExplain [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ isHotHire             [3m[90m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ count                 [3m[90m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ aggregatePositionIds  [3m[90m<chr>[39m[23m "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]", ~
$ famousCompany         [3m[90m<lgl>[39m[23m FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, T~

P111-120

111（数据操作）：查找secondType与thirdType值相等的行号

which(df$secondType == df$thirdType)

112（数据操作）：查找薪资大于平均薪资的第三个数据

df %>% 
  filter(salary > mean(salary)) %>% 
  slice(3)

A data.frame: 1 × 53
positionId	positionName	companyId	companyLogo	companySize	industryField	financeStage	companyLabelList	firstType	secondType	...	plus	pcShow	appShow	deliver	gradeDescription	promotionScoreExplain	isHotHire	count	aggregatePositionIds	famousCompany
<int>	<chr>	<int>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	...	<lgl>	<int>	<int>	<int>	<lgl>	<lgl>	<int>	<int>	<chr>	<lgl>
6882347	数据分析	94826	image2/M00/04/12/CgpzWlXwKIKAaJd9AAARGoomCu4858.jpg?cc=0.6776386289857328	50-150人	移动互联网,社交	B轮	['股票期权', '扁平管理', '五险一金', '岗位晋升']	产品\|需求\|项目类	数据分析	...	NA	0	0	0	NA	NA	0	0	[]	FALSE

113（数据操作）：将上一题数据的salary列开根号

head(
df %>% 
  mutate(salary_sqrt = sqrt(salary)) %>% 
  select(salary, salary_sqrt)
)

A data.frame: 6 × 2
	salary	salary_sqrt
	<int>	<dbl>
1	37500	193.6492
2	15000	122.4745
3	3500	59.1608
4	45000	212.1320
5	30000	173.2051
6	50000	223.6068

114（数据操作）：将上一题数据的linestation列按_拆分

head(

df %>% 
  separate(linestaion, into = c("line", "station"), sep = "_", remove = FALSE) %>% 
  select(linestaion, line, station)
)

Warning message:
"Expected 2 pieces. Additional pieces discarded in 41 rows [3, 8, 10, 12, 13, 20, 21, 22, 23, 27, 29, 30, 31, 32, 34, 36, 37, 38, 41, 44, ...]."
Warning message:
"Expected 2 pieces. Missing pieces filled with `NA` in 60 rows [1, 2, 5, 6, 7, 9, 11, 14, 15, 16, 17, 18, 19, 24, 25, 26, 28, 33, 35, 39, ...]."

A data.frame: 6 × 3
	linestaion	line	station
	<chr>	<chr>	<chr>
1			NA
2			NA
3	4号线_城星路;4号线_市民中心;4号线_江锦路	4号线	城星路;4号线
4	1号线_文泽路	1号线	文泽路
5			NA
6			NA

head(
df %>%
  mutate(split = str_split(linestaion,'_'))
)

A data.frame: 6 × 54
	positionId	positionName	companyId	companyLogo	companySize	industryField	financeStage	companyLabelList	firstType	secondType	...	pcShow	appShow	deliver	gradeDescription	promotionScoreExplain	isHotHire	count	aggregatePositionIds	famousCompany	split
	<int>	<chr>	<int>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	...	<int>	<int>	<int>	<lgl>	<lgl>	<int>	<int>	<chr>	<lgl>	<list>
1	6802721	数据分析	475770	i/image2/M01/B7/3E/CgoB5lwPfEaAdn8WAABWQ0Jgl5s31.jpeg	50-150人	移动互联网,电商	A轮	['绩效奖金', '带薪年假', '定期体检', '弹性工作']	产品\|需求\|项目类	数据分析	...	0	0	0	NA	NA	0	0	[]	FALSE
2	5204912	数据建模	50735	image1/M00/00/85/CgYXBlTUXeeAR0IjAABbroUk-dw977.png	150-500人	电商	B轮	['年终奖金', '做五休二', '六险一金', '子女福利']	开发\|测试\|运维类	数据开发	...	0	0	0	NA	NA	0	0	[]	FALSE
3	6877668	数据分析	100125	image2/M00/0C/57/CgqLKVYcOA2ADcFuAAAE8MukIKA745.png?cc=0.8089290193747729	2000人以上	移动互联网,企业服务	上市公司	['节日礼物', '年底双薪', '股票期权', '带薪年假']	产品\|需求\|项目类	数据分析	...	0	0	0	NA	NA	0	0	[]	FALSE	4号线 , 城星路;4号线 , 市民中心;4号线, 江锦路
4	6496141	数据分析	26564	i/image2/M01/F7/3F/CgoB5lyGAQGAZeI-AAAdOqXecnw673.jpg	500-2000人	电商	D轮及以上	['生日趴', '每月腐败基金', '每月补贴', '年度旅游']	开发\|测试\|运维类	数据开发	...	0	0	0	NA	NA	0	0	[]	TRUE	1号线 , 文泽路
5	6467417	数据分析	29211	i/image2/M01/77/B8/CgoB5l1WDyGATNP5AAAlY3h88SY623.png	2000人以上	物流丨运输	上市公司	['技能培训', '免费班车', '专项奖金', '岗位晋升']	产品\|需求\|项目类	数据分析	...	0	0	0	NA	NA	0	0	[]	TRUE
6	6882347	数据分析	94826	image2/M00/04/12/CgpzWlXwKIKAaJd9AAARGoomCu4858.jpg?cc=0.6776386289857328	50-150人	移动互联网,社交	B轮	['股票期权', '扁平管理', '五险一金', '岗位晋升']	产品\|需求\|项目类	数据分析	...	0	0	0	NA	NA	0	0	[]	FALSE

115（数据查看）：查看上一题数据一共有多少列

ncol(df)

116（数据操作）：提取industryField列以"数据"开头的行

head(
df[grep("^数据", df$industryField),]
)

A data.frame: 6 × 54
	positionId	positionName	companyId	companyLogo	companySize	industryField	financeStage	companyLabelList	firstType	secondType	...	pcShow	appShow	deliver	gradeDescription	promotionScoreExplain	isHotHire	count	aggregatePositionIds	famousCompany	split
	<int>	<chr>	<int>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	...	<int>	<int>	<int>	<lgl>	<lgl>	<int>	<int>	<chr>	<lgl>	<list>
9	6458372	数据分析专家	34132	i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png	150-500人	数据服务,广告营销	A轮	['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']	产品\|需求\|项目类	数据分析	...	0	0	0	NA	NA	0	0	[]	FALSE
11	6804629	数据分析师	34132	i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png	150-500人	数据服务,广告营销	A轮	['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']	产品\|需求\|项目类	数据分析	...	0	0	0	NA	NA	0	0	[]	FALSE
14	6804489	资深数据分析师	34132	i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png	150-500人	数据服务,广告营销	A轮	['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']	开发\|测试\|运维类	数据开发	...	0	0	0	NA	NA	0	0	[]	FALSE
22	6267370	数据分析专家	31544	image1/M00/00/48/CgYXBlTUXOaADKooAABjQoD_n1w508.png	150-500人	数据服务	不需要融资	['专业红娘牵线', '节日礼物', '技能培训', '岗位晋升']	开发\|测试\|运维类	数据开发	...	0	0	0	NA	NA	0	0	[]	FALSE	4号线 , 中医药大学;4号线, 联庄
33	6804489	资深数据分析师	34132	i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png	150-500人	数据服务,广告营销	A轮	['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']	开发\|测试\|运维类	数据开发	...	0	0	0	NA	NA	0	0	[]	FALSE
38	6242470	数据分析师	31544	image1/M00/00/48/CgYXBlTUXOaADKooAABjQoD_n1w508.png	150-500人	数据服务	不需要融资	['专业红娘牵线', '节日礼物', '技能培训', '岗位晋升']	产品\|需求\|项目类	数据分析	...	0	0	0	NA	NA	0	0	[]	FALSE	4号线 , 中医药大学;4号线, 联庄

head(
df %>% 
  filter(str_detect(industryField, "^数据"))
)

A data.frame: 6 × 54
	positionId	positionName	companyId	companyLogo	companySize	industryField	financeStage	companyLabelList	firstType	secondType	...	pcShow	appShow	deliver	gradeDescription	promotionScoreExplain	isHotHire	count	aggregatePositionIds	famousCompany	split
	<int>	<chr>	<int>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	...	<int>	<int>	<int>	<lgl>	<lgl>	<int>	<int>	<chr>	<lgl>	<list>
1	6458372	数据分析专家	34132	i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png	150-500人	数据服务,广告营销	A轮	['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']	产品\|需求\|项目类	数据分析	...	0	0	0	NA	NA	0	0	[]	FALSE
2	6804629	数据分析师	34132	i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png	150-500人	数据服务,广告营销	A轮	['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']	产品\|需求\|项目类	数据分析	...	0	0	0	NA	NA	0	0	[]	FALSE
3	6804489	资深数据分析师	34132	i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png	150-500人	数据服务,广告营销	A轮	['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']	开发\|测试\|运维类	数据开发	...	0	0	0	NA	NA	0	0	[]	FALSE
4	6267370	数据分析专家	31544	image1/M00/00/48/CgYXBlTUXOaADKooAABjQoD_n1w508.png	150-500人	数据服务	不需要融资	['专业红娘牵线', '节日礼物', '技能培训', '岗位晋升']	开发\|测试\|运维类	数据开发	...	0	0	0	NA	NA	0	0	[]	FALSE	4号线 , 中医药大学;4号线, 联庄
5	6804489	资深数据分析师	34132	i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png	150-500人	数据服务,广告营销	A轮	['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']	开发\|测试\|运维类	数据开发	...	0	0	0	NA	NA	0	0	[]	FALSE
6	6242470	数据分析师	31544	image1/M00/00/48/CgYXBlTUXOaADKooAABjQoD_n1w508.png	150-500人	数据服务	不需要融资	['专业红娘牵线', '节日礼物', '技能培训', '岗位晋升']	产品\|需求\|项目类	数据分析	...	0	0	0	NA	NA	0	0	[]	FALSE	4号线 , 中医药大学;4号线, 联庄

117（数据分组汇总）：以salary score和positionID做数据透视表

df %>% 
  group_by(positionId) %>% 
  summarise(salary_avg = mean(salary),
            score_avg = mean(score))

A data.frame: 1 × 2
salary_avg	score_avg
<dbl>	<dbl>
31723.81	12.71429

df1 <- df %>%
  group_by(positionId) %>%
  dplyr::summarise(salary = mean(salary),
            score = mean(score)) %>%
  as.data.frame(.)
rownames(df) <- NULL
head(
tibble::column_to_rownames(df1,var='positionId')
)

A data.frame: 6 × 2
	salary	score
	<dbl>	<dbl>
5203054	30000	4
5204912	15000	176
5269002	37500	1
5453691	30000	4
5519962	37500	14
5520623	30000	6

118（数据分组汇总）：同时对salary、score两列进行汇总计算

df1 %>% 
  summarise_at(vars(salary, score), list(~sum(.), ~mean(.), ~min(.)))

A data.frame: 1 × 6
salary_sum	score_sum	salary_mean	score_mean	salary_min	score_min
<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
3009750	1275.5	31681.58	13.42632	3500	0

119（数据分组汇总）：同时对不同列进行不同的汇总计算：对salary求平均，对score求和

df1 %>% 
  summarise(salary_avg = mean(salary),
            score_sum = sum(score))
#注：若要分组再这样汇总，前面加上group_by(grpvar)即可。

A data.frame: 1 × 2
salary_avg	score_sum
<dbl>	<dbl>
31681.58	1275.5

120（数据分组汇总）：计算并提取平均薪资最高的区

head(df)

A data.frame: 6 × 53
	positionId	positionName	companyId	companyLogo	companySize	industryField	financeStage	companyLabelList	firstType	secondType	...	plus	pcShow	appShow	deliver	gradeDescription	promotionScoreExplain	isHotHire	count	aggregatePositionIds	famousCompany
	<int>	<chr>	<int>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	...	<lgl>	<int>	<int>	<int>	<lgl>	<lgl>	<int>	<int>	<chr>	<lgl>
1	6802721	数据分析	475770	i/image2/M01/B7/3E/CgoB5lwPfEaAdn8WAABWQ0Jgl5s31.jpeg	50-150人	移动互联网,电商	A轮	['绩效奖金', '带薪年假', '定期体检', '弹性工作']	产品\|需求\|项目类	数据分析	...	NA	0	0	0	NA	NA	0	0	[]	FALSE
2	5204912	数据建模	50735	image1/M00/00/85/CgYXBlTUXeeAR0IjAABbroUk-dw977.png	150-500人	电商	B轮	['年终奖金', '做五休二', '六险一金', '子女福利']	开发\|测试\|运维类	数据开发	...	NA	0	0	0	NA	NA	0	0	[]	FALSE
3	6877668	数据分析	100125	image2/M00/0C/57/CgqLKVYcOA2ADcFuAAAE8MukIKA745.png?cc=0.8089290193747729	2000人以上	移动互联网,企业服务	上市公司	['节日礼物', '年底双薪', '股票期权', '带薪年假']	产品\|需求\|项目类	数据分析	...	NA	0	0	0	NA	NA	0	0	[]	FALSE
4	6496141	数据分析	26564	i/image2/M01/F7/3F/CgoB5lyGAQGAZeI-AAAdOqXecnw673.jpg	500-2000人	电商	D轮及以上	['生日趴', '每月腐败基金', '每月补贴', '年度旅游']	开发\|测试\|运维类	数据开发	...	NA	0	0	0	NA	NA	0	0	[]	TRUE
5	6467417	数据分析	29211	i/image2/M01/77/B8/CgoB5l1WDyGATNP5AAAlY3h88SY623.png	2000人以上	物流丨运输	上市公司	['技能培训', '免费班车', '专项奖金', '岗位晋升']	产品\|需求\|项目类	数据分析	...	NA	0	0	0	NA	NA	0	0	[]	TRUE
6	6882347	数据分析	94826	image2/M00/04/12/CgpzWlXwKIKAaJd9AAARGoomCu4858.jpg?cc=0.6776386289857328	50-150人	移动互联网,社交	B轮	['股票期权', '扁平管理', '五险一金', '岗位晋升']	产品\|需求\|项目类	数据分析	...	NA	0	0	0	NA	NA	0	0	[]	FALSE

df2 <- data.frame(df[,'district'],df[,'salary'])
names(df2) <- c("district",'salary')
head(df2)

A data.frame: 6 × 2
	district	salary
	<chr>	<int>
1	余杭区	37500
2	滨江区	15000
3	江干区	3500
4	江干区	45000
5	余杭区	30000
6	余杭区	50000

df2 %>% 
  group_by(district) %>% 
  summarise(salary_avg = mean(salary)) %>% 
  top_n(1, salary_avg)

A data.frame: 1 × 1
salary_avg
<dbl>
31723.81

df %>%
  group_by(district) %>%
  summarise(avg = mean(salary)) %>%
  arrange(desc(avg)) %>%
  filter(row_number() == 1)

A data.frame: 1 × 1
avg
<dbl>
31723.81