R语言数据分析120题


library(tidyverse)
library(zoo)
library(writexl)

P1-20

1(创建数据框):将下面的字典创建为DataFrame

data = {"grammer": ["Python","C","Java","GO",np.nan,"SQL","PHP","Python"], "score":[1,2,np.nan,4,5,6,7,10]}

library(tibble)
df <- tibble( 
grammer = c("Python","C","Java","GO", NA,"SQL","PHP","Python"), 
    score = c(1,2,NA,4,5,6,7,10)
 )
df
A tibble: 8 × 2
grammerscore
<chr><dbl>
Python 1
C 2
Java NA
GO 4
NA 5
SQL 6
PHP 7
Python10
df_1 <- data.frame( 
grammer = c("Python","C","Java","GO", NA,"SQL","PHP","Python"), 
    score = c(1,2,NA,4,5,6,7,10)
 )
df_1
A data.frame: 8 × 2
grammerscore
<chr><dbl>
Python 1
C 2
Java NA
GO 4
NA 5
SQL 6
PHP 7
Python10
2(筛选行):提取含有字符串"Python"的行
Exact match (the whole string equals "Python"):
df %>% 
    filter(grammer == "Python")
A tibble: 2 × 2
grammerscore
<chr><dbl>
Python 1
Python10
Partial match (the string contains a given substring):

R语言使用dplyr提取行列:https://www.cnblogs.com/chenwenyan/p/15261246.html

df_1 <- dplyr::filter(df, grepl('P', grammer))
df_1
A tibble: 3 × 2
grammerscore
<chr><dbl>
Python 1
PHP 7
Python10
#sessionInfo()  查看R版本
#R.Version()
3(查看列名):输出df的所有列名
names(df)
  1. 'grammer'
  2. 'score'
4(修改列名):修改第二列列名为popularity

重命名函数rename()

df = df %>% rename(popularity = score)
names(df)
  1. 'grammer'
  2. 'popularity'
5(统计频数):统计grammer列中每一种编程语言出现的次数

计数函数count()

df %>% count(grammer)
A tibble: 7 × 2
grammern
<chr><int>
C 1
GO 1
Java 1
PHP 1
Python2
SQL 1
NA 1
6(缺失值处理):将空值用上下值的平均值填充

https://blog.csdn.net/weixin_51077152/article/details/119945978

(1) Test whether elements are missing: is.na()

(2) Remove missing values (row-wise deletion):
complete.cases() returns a logical vector marking the rows that contain no missing values (use it to subset the data frame)

na.omit() drops every row that contains a missing value.
The extended signature na.omit(object, cols, invert) belongs to data.table's method:
object: the data set
cols: a vector of column names (or numbers) to check for missing values; defaults to all columns
invert: FALSE (the default) omits all rows with missing values; TRUE returns only the rows that have missing values

(3) Counting missing values per column/row is often demonstrated with the sleep data set, which requires installing the VIM package first.

A short sketch of these base tools follows below.

na.approx (zoo package): fill NA values by interpolation
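A minimal sketch (not from the original post) showing these three base tools on a toy data frame:

toy <- data.frame(x = c(1, NA, 3), y = c("a", "b", NA))
is.na(toy)                  # element-wise logical matrix of missingness
toy[complete.cases(toy), ]  # keep only the rows without any NA
na.omit(toy)                # same result: drop every row that contains an NA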

df = df %>% mutate(popularity = zoo::na.approx(popularity))
df
A tibble: 8 × 2
grammerpopularity
<chr><dbl>
Python 1
C 2
Java 3
GO 4
NA 5
SQL 6
PHP 7
Python10

mutate函数,关于mutate函数补充如下:

# 删除grammer列的数据,即将null值赋予列
df %>%
  mutate(grammer=NULL)
A tibble: 8 × 1
popularity
<dbl>
1
2
3
4
5
6
7
10
7(筛选行):提取popularity列中值大于3的行
df %>% filter(popularity >3)
A tibble: 5 × 2
grammerpopularity
<chr><dbl>
GO 4
NA 5
SQL 6
PHP 7
Python10
8(数据去重):按grammer列进行去重

distinct() drops duplicate records, keeping only one copy of each
.keep_all = TRUE keeps all variables; = FALSE (the default) returns only the deduplicated column(s)

df %>% distinct(grammer, .keep_all = TRUE)
A tibble: 7 × 2
grammerpopularity
<chr><dbl>
Python1
C 2
Java 3
GO 4
NA 5
SQL 6
PHP 7
9(数据计算):计算popularity列平均值
df %>%
  summarise(popularity_avg = mean(popularity))
  # summarise()创建新的data frame
  # 可以实现分类汇总
A tibble: 1 × 1
popularity_avg
<dbl>
4.75
10(格式转换):将grammer列转换为序列
df$grammer
  1. 'Python'
  2. 'C'
  3. 'Java'
  4. 'GO'
  5. NA
  6. 'SQL'
  7. 'PHP'
  8. 'Python'
11(数据保存):将数据框保存为Excel
writexl::write_xlsx(df, "filename.xlsx")
12(数据查看):查看数据的行数列数
dim(df)
  1. 8
  2. 2
13(筛选行):提取popularity列值大于3小于7的行
df %>%
  filter(popularity > 3 & popularity < 7)
A tibble: 3 × 2
grammerpopularity
<chr><dbl>
GO 4
NA 5
SQL6
14(调整列位置):交换两列的位置
df %>%
  select(popularity, grammer)
  # select()仅保留提到的变量
A tibble: 8 × 2
popularitygrammer
<dbl><chr>
1Python
2C
3Java
4GO
5NA
6SQL
7PHP
10Python
15(筛选行):提取popularity列最大值所在的行
df %>%
  filter(popularity == max(popularity))
A tibble: 1 × 2
grammerpopularity
<chr><dbl>
Python10
16(查看数据):查看最后几行数据
tail(df)
head(df) 
# tail默认最后6行,head默认前6行
A tibble: 6 × 2
grammerpopularity
<chr><dbl>
Java 3
GO 4
NA 5
SQL 6
PHP 7
Python10
A tibble: 6 × 2
grammerpopularity
<chr><dbl>
Python1
C 2
Java 3
GO 4
NA 5
SQL 6
17(修改数据):删除最后一行数据
df %>% 
  slice(-n())
  # 删除第n行的数据,即最后一行的数据
A tibble: 7 × 2
grammerpopularity
<chr><dbl>
Python1
C 2
Java 3
GO 4
NA 5
SQL 6
PHP 7

在这里用的是slice函数,关于slice函数补充如下:

# 如果选择某一特定行数据,例如选取第三行:
df %>% 
  slice(3)
# 如果删去某一特定行数据,例如删去第三行:
df %>% 
  slice(-3)
# 如果选择特定几行数据,例如选取第3-5行:
df %>% 
  slice(3:5)
# 如果删去特定几行数据,例如删除第3-5行:
df %>% 
  slice(-(3:5))
A tibble: 1 × 2
grammerpopularity
<chr><dbl>
Java3
A tibble: 7 × 2
grammerpopularity
<chr><dbl>
Python 1
C 2
GO 4
NA 5
SQL 6
PHP 7
Python10
A tibble: 3 × 2
grammerpopularity
<chr><dbl>
Java3
GO 4
NA 5
A tibble: 5 × 2
grammerpopularity
<chr><dbl>
Python 1
C 2
SQL 6
PHP 7
Python10
18(修改数据):添加一行数据:“Perl”, 6
newrow = tibble(grammer="Perl", popularity=6)
  # 定义新的newrow,a tibble:1*2
df %>%
  bind_rows(newrow)
  # 使用bind_rows(),合并两个tibble
A tibble: 9 × 2
grammerpopularity
<chr><dbl>
Python 1
C 2
Java 3
GO 4
NA 5
SQL 6
PHP 7
Python10
Perl 6
19(数据整理):对数据按popularity列值从大到小排序
df %>%
  arrange(desc(popularity))
A tibble: 8 × 2
grammerpopularity
<chr><dbl>
Python10
PHP 7
SQL 6
NA 5
GO 4
Java 3
C 2
Python 1

arrange函数(排序函数) :
arrange(desc(data)) 从大到小
arrange(data) 从小到大

# 对数据按popularity列值从到小到大排序
df %>%
  arrange(popularity)
A tibble: 8 × 2
grammerpopularity
<chr><dbl>
Python 1
C 2
Java 3
GO 4
NA 5
SQL 6
PHP 7
Python10
20(字符统计):统计grammer列每个字符串的长度

mutate()函数对数据进行增加列

df %>%
  mutate(strlen = str_length(grammer))
  # 使用mutate()函数对数据进行增加列
  # 定义strlen为新列,赋值为grammer的字符串长度
A tibble: 8 × 3
grammerpopularitystrlen
<chr><dbl><int>
Python 1 6
C 2 1
Java 3 4
GO 4 2
NA 5NA
SQL 6 3
PHP 7 3
Python10 6

P21-30

21(读取数据):读取本地Excel数据
df = readxl::read_xlsx("./data_R/21-50数据.xlsx")
# 需要保证数据在当前路径,可通过setwd()设置
22(查看数据):查看df数据的前几行(后几行)
head(df)
#tail(df)
A tibble: 6 × 3
createTimeeducationsalary
<dttm><chr><chr>
2020-03-16 11:30:18本科20k-35k
2020-03-16 10:58:48本科20k-40k
2020-03-16 10:46:39不限20k-35k
2020-03-16 10:45:44本科13k-20k
2020-03-16 10:20:41本科10k-20k
2020-03-16 10:33:48本科10k-18k
23(数据计算):将salary列数据转换为最大值与最小值的平均值
df = df %>%
    separate(salary, into = c("low", "high"), sep = "-")     
    # 先将salary列分成两列low和high
df = df %>%   
    mutate(salary = (parse_number(low) + parse_number(high)) * 1000 / 2)
    # 将low和high两列的数字部分运算,赋值给salary
    # parse_number()指提取变量中的数字部分
df = df %>%
  select(-c(low, high))
    # 删除low和high列
    # 也可以像#6补充里那样,用mutate()给列赋NULL来删除列
head(df)
A tibble: 6 × 3
createTimeeducationsalary
<dttm><chr><dbl>
2020-03-16 11:30:18本科27500
2020-03-16 10:58:48本科30000
2020-03-16 10:46:39不限27500
2020-03-16 10:45:44本科16500
2020-03-16 10:20:41本科15000
2020-03-16 10:33:48本科14000
24(分组汇总):根据学历分组,并计算平均薪资
df %>% 
  group_by(education) %>% summarise(salary_avg = mean(salary))
    # 按education取值相同的行进行分组,分组后的tibble属性中会显示 Groups: education [4]
      # summarise()创建新的数据框,包含education和salary_avg两列
A tibble: 4 × 2
educationsalary_avg
<chr><dbl>
本科19361.34
不限19600.00
大专10000.00
硕士20642.86

summarise函数(聚合函数),关于summarise函数补充如下:

# 创建新的数据框data frame,命名为salary,包含有group,salary_avg,salary_max,salary_min
salary=df %>% 
  group_by(education) %>% 
  summarise(salary_avg = mean(salary),salary_max=max(salary),salary_min=min(salary))
salary
A tibble: 4 × 4
educationsalary_avgsalary_maxsalary_min
<chr><dbl><dbl><dbl>
本科19361.3445000 3500
不限19600.0030000 3500
大专10000.0015000 5000
硕士20642.863750012500
25(时间转换):将createTime列转换为"月-日"
library(lubridate)

head(
    
df %>% 
  mutate(createTime = str_c(month(createTime), "-", day(createTime)))
)

  # 增加列createTime,包含有月份-日期
  # str_c合并字符串
A tibble: 6 × 3
createTimeeducationsalary
<chr><chr><dbl>
3-16本科27500
3-16本科30000
3-16不限27500
3-16本科16500
3-16本科15000
3-16本科14000
26(查看数据):查看数据结构信息
df %>% 
  glimpse()
  # 查看数据结构信息
object.size(df)
  # 查看对象占用内存
Rows: 135
Columns: 3
$ createTime <dttm> 2020-03-16 11:30:18, 2020-03-16 10:58:48, 2020-03-16 10:46~
$ education  <chr> "本科", "本科", "不限", "本科", "本科", "本科", "硕士", "本~
$ salary     <dbl> 27500, 30000, 27500, 16500, 15000, 14000, 23000, 12500, 700~



5112 bytes
27(查看数据):查看数据汇总信息
summary(df)
   createTime                   education             salary     
 Min.   :2020-03-13 18:01:31   Length:135         Min.   : 3500  
 1st Qu.:2020-03-16 10:41:19   Class :character   1st Qu.:14000  
 Median :2020-03-16 11:00:27   Mode  :character   Median :17500  
 Mean   :2020-03-16 10:16:35                      Mean   :19159  
 3rd Qu.:2020-03-16 11:19:03                      3rd Qu.:25000  
 Max.   :2020-03-16 11:36:07                      Max.   :45000  
28(修改列):新增一列将salary离散化为三水平值

case_when函数,允许定义多个if_else()语句,等效于SQL中CASE WHEN 语句

df = df %>%
  mutate(class = case_when(
    salary >= 0 & salary < 5000      ~ "低",
    salary >= 5000 & salary < 20000 ~ "中",
TRUE                                ~ "高"))      
  # 如果salary的取值在[0,5000),赋值class=低;
  # 如果salary的取值在[5000,20000),赋值class=中;
  # 如果salary的取值在[20000,+∞),赋值class=高;
  # TRUE的效果是其它,等效于salary>=20000
head(df)
A tibble: 6 × 4
createTimeeducationsalaryclass
<dttm><chr><dbl><chr>
2020-03-16 11:30:18本科27500
2020-03-16 10:58:48本科30000
2020-03-16 10:46:39不限27500
2020-03-16 10:45:44本科16500
2020-03-16 10:20:41本科15000
2020-03-16 10:33:48本科14000
29(数据整理):按salary列对数据降序排列
df = df %>% 
  arrange(desc(salary))
  # 同#19
30(筛选行):提取第33行数据
df[33,]
A tibble: 1 × 4
createTimeeducationsalaryclass
<dttm><chr><dbl><chr>
2020-03-16 10:54:56本科25000

P31-40

31(数据计算):计算salary列的中位数

计算中位数median()

median(df$salary)

17500

df %>% 
  summarise(salary_med = median(salary))
# 创建新的tibble 1*1,将salary列的中位数赋值给salary_med
A tibble: 1 × 1
salary_med
<dbl>
17500
32(数据可视化):绘制salary列的频率分布直方图
png('./data_R/out/32_直方图.png')
df %>%
  ggplot(aes(x = salary)) +
  geom_histogram(bins = 10)
  # salary设定为x轴绘图,设定图形格式为直方图,分为10组
dev.off()

png: 2

33(数据可视化):绘制salary的频率密度曲线图
png('./data_R/out/33_曲线图.png')
df %>%
  ggplot(aes(x = salary)) +
  geom_density()
  # 同#32,设定图形格式为密度曲线
dev.off()

png: 2

34(数据删除):删除列class
head(
    df %>% 
      select(-class)
)
# 选择class列删除
# 或者同#6的补充,给class列赋空值即删去,如下

head(
    df %>%
      mutate(class = NULL)
)

A tibble: 6 × 3
createTimeeducationsalary
<dttm><chr><dbl>
2020-03-16 11:30:17本科45000
2020-03-16 11:04:00本科40000
2020-03-16 10:36:57本科37500
2020-03-16 11:01:39本科37500
2020-03-16 09:54:47硕士37500
2020-03-16 11:01:22本科35000
A tibble: 6 × 3
createTimeeducationsalary
<dttm><chr><dbl>
2020-03-16 11:30:17本科45000
2020-03-16 11:04:00本科40000
2020-03-16 10:36:57本科37500
2020-03-16 11:01:39本科37500
2020-03-16 09:54:47硕士37500
2020-03-16 11:01:22本科35000
35(数据操作):将df的第1列与第2列合并为新的一列

The unite() function
"newcol" is the name for the merged column, 1:2 selects columns 1 and 2 to merge, and sep = " " separates the merged values with a space.
(For example, unite("newcol", 1:3, sep = ",") would merge columns 1-3, separated by commas.)

head(
    df %>% 
      unite("newcol", 1:2, sep = " ")
)
A tibble: 6 × 3
newcolsalaryclass
<chr><dbl><chr>
2020-03-16 11:30:17 本科45000
2020-03-16 11:04:00 本科40000
2020-03-16 10:36:57 本科37500
2020-03-16 11:01:39 本科37500
2020-03-16 09:54:47 硕士37500
2020-03-16 11:01:22 本科35000
36(数据操作):将education列与第salary列合并为新的一列
head(
    df %>% unite("newcol", c(education, salary), sep = " ")
    )
  # 同#35
A tibble: 6 × 3
createTimenewcolclass
<dttm><chr><chr>
2020-03-16 11:30:17本科 45000
2020-03-16 11:04:00本科 40000
2020-03-16 10:36:57本科 37500
2020-03-16 11:01:39本科 37500
2020-03-16 09:54:47硕士 37500
2020-03-16 11:01:22本科 35000
37(数据计算):计算salary最大值与最小值之差
max(df$salary) - min(df$salary)

41500

38(数据操作):将第一行与最后一行拼接
bind_rows(df[1,], df[nrow(df),])
# 第一行df[1,]
# 最后一行df[nrow(df),]
A tibble: 2 × 4
createTimeeducationsalaryclass
<dttm><chr><dbl><chr>
2020-03-16 11:30:17本科45000
2020-03-16 10:48:43本科 3500
39(数据操作):将第8行添加到末尾
bind_rows(df, df[8,]) %>%  tail()
#将第八行合并至末尾,同#38
# 显示末尾行
A tibble: 6 × 4
createTimeeducationsalaryclass
<dttm><chr><dbl><chr>
2020-03-16 11:20:44本科 4500
2020-03-16 11:12:04本科 4000
2020-03-16 11:12:04本科 4000
2020-03-16 10:44:23不限 3500
2020-03-16 10:48:43本科 3500
2020-03-16 10:41:20本科35000
40(查看数据):查看每一列的数据类型
df %>% 
  glimpse()
  # 同#26
Rows: 135
Columns: 4
$ createTime <dttm> 2020-03-16 11:30:17, 2020-03-16 11:04:00, 2020-03-16 10:36~
$ education  <chr> "本科", "本科", "本科", "本科", "硕士", "本科", "本科", "本~
$ salary     <dbl> 45000, 40000, 37500, 37500, 37500, 35000, 35000, 35000, 325~
$ class      <chr> "高", "高", "高", "高", "高", "高", "高", "高", "高", "高",~

P41-50

41(数据操作):将createTime列设置为行索引
head(
df %>% distinct(createTime, .keep_all = TRUE) %>% column_to_rownames("createTime")
)
A data.frame: 6 × 3
educationsalaryclass
<chr><dbl><chr>
2020-03-16 11:30:17本科45000
2020-03-16 11:04:00本科40000
2020-03-16 10:36:57本科37500
2020-03-16 11:01:39本科37500
2020-03-16 09:54:47硕士37500
2020-03-16 11:01:22本科35000
42(数据创建):生成一个和df长度相同的随机数数据框
df1 = tibble(rnums = sample.int(10, nrow(df), replace = TRUE))
head(df1)
# 创建这样的一个tibble数据框,将随机数赋值给rnums,要求为:数据为10以内正整数,数量同df,可以重复
A tibble: 6 × 1
rnums
<int>
9
10
7
9
5
4
43(数据连接):将上面生成的数据框与df按列合并
df = bind_cols(df, df1)
head(df)
A tibble: 6 × 5
createTimeeducationsalaryclassrnums
<dttm><chr><dbl><chr><int>
2020-03-16 11:30:17本科45000 9
2020-03-16 11:04:00本科4000010
2020-03-16 10:36:57本科37500 7
2020-03-16 11:01:39本科37500 9
2020-03-16 09:54:47硕士37500 5
2020-03-16 11:01:22本科35000 4
44(修改列):生成新列new为salary列减去随机数列
df = df %>% 
  mutate(new = salary - rnums)
45(检查缺失值):检查数据中是否含有任何缺失值
anyNA(df)

FALSE

46(类型转换):将rnums列的类型转换为浮点数
head(
    df %>% mutate(rnums = as.double(rnums))
  # rnums为int型,使用as.double进行转换,其他形式也类比
)

A tibble: 6 × 6
createTimeeducationsalaryclassrnumsnew
<dttm><chr><dbl><chr><dbl><dbl>
2020-03-16 11:30:17本科45000 944991
2020-03-16 11:04:00本科400001039990
2020-03-16 10:36:57本科37500 737493
2020-03-16 11:01:39本科37500 937491
2020-03-16 09:54:47硕士37500 537495
2020-03-16 11:01:22本科35000 434996
47(数据汇总):计算salary列大于10000的次数
df %>% 
  count(salary > 10000)
  # 同#5
A tibble: 2 × 2
salary > 10000n
<lgl><int>
FALSE 16
TRUE119
  # 或者利用summarise函数进行汇总,如下
df %>% 
  summarise(n = sum(salary > 10000))
A tibble: 1 × 1
n
<int>
119
48(统计频数):查看每种学历出现的次数
df %>% count(education)
A tibble: 4 × 2
educationn
<chr><int>
本科119
不限 5
大专 4
硕士 7
table(df$education)


本科 不限 大专 硕士
119 5 4 7

49(数据汇总):查看education列共有几种学历
df %>% distinct(education)
# 同#8,此题默认.keep_all = FALSE,也就是除去education都不显示
A tibble: 4 × 1
education
<chr>
本科
硕士
不限
大专
50(筛选行):提取salary与new列之和大于60000的最后3行
df %>% filter(salary + new > 60000) %>% slice((n()-2):n())
# filter过滤出salary+new>60000的行
# slice选择其中的后三行
A tibble: 3 × 6
createTimeeducationsalaryclassrnumsnew
<dttm><chr><dbl><chr><int><dbl>
2020-03-16 10:41:20本科35000434996
2020-03-16 10:41:20本科35000734993
2020-03-16 11:30:10本科32500832492

P51-60

51(读取数据):使用绝对路径读取本地Excel数据
df <- readxl::read_xls("C:/Users/86156/.jupyter/Code/Jupyter-Notebook/R_Python/data_R/51-80数据.xls")
head(df, 3)
Registered S3 methods overwritten by 'tibble':
  method     from  
  format.tbl pillar
  print.tbl  pillar

A tibble: 3 × 18
代码简称日期前收盘价(元)开盘价(元)最高价(元)最低价(元)收盘价(元)成交量(股)成交金额(元)涨跌(元)涨跌幅(%)均价(元)换手率(%)A股流通市值(元)总市值(元)A股流通股本(股)市盈率
<chr><chr><chr><dbl><dbl><dbl><dbl><dbl><chr><chr><dbl><dbl><chr><chr><dbl><dbl><dbl><dbl>
600000.SH浦发银行2016_1_416.135616.144416.144415.499715.720542240610754425783 -0.4151-2.572517.8601999999999990.226399999999999993.32e+113.32e+11186534714156.5614
600000.SH浦发银行2016_1_515.720515.464415.950115.367215.8618580547931034181474 0.1413 0.898917.8139 0.311199999999999983.35e+113.35e+11186534714156.6204
600000.SH浦发银行2016_1_615.861815.808816.020815.623415.985546772653838667398 0.1236 0.779517.9307000000000020.250699999999999983.38e+113.38e+11186534714156.6720
#df <- read.csv('./data_R/51-80数据.csv')
52(查看数据):查看数据框的前3行
#head(df,3)
df %>% head(3)
A tibble: 3 × 18
代码简称日期前收盘价(元)开盘价(元)最高价(元)最低价(元)收盘价(元)成交量(股)成交金额(元)涨跌(元)涨跌幅(%)均价(元)换手率(%)A股流通市值(元)总市值(元)A股流通股本(股)市盈率
<chr><chr><chr><dbl><dbl><dbl><dbl><dbl><chr><chr><dbl><dbl><chr><chr><dbl><dbl><dbl><dbl>
600000.SH浦发银行2016_1_416.135616.144416.144415.499715.720542240610754425783 -0.4151-2.572517.8601999999999990.226399999999999993.32e+113.32e+11186534714156.5614
600000.SH浦发银行2016_1_515.720515.464415.950115.367215.8618580547931034181474 0.1413 0.898917.8139 0.311199999999999983.35e+113.35e+11186534714156.6204
600000.SH浦发银行2016_1_615.861815.808816.020815.623415.985546772653838667398 0.1236 0.779517.9307000000000020.250699999999999983.38e+113.38e+11186534714156.6720
53(查看缺失值):查看每列数据缺失值情况

naniar包探索缺失值的数据结构

library(naniar)
#colSums(is.na(df))
df %>% miss_var_summary()
A tibble: 18 × 3
variablen_misspct_miss
<chr><int><dbl>
代码 00
简称 00
日期 00
前收盘价(元) 00
开盘价(元) 00
最高价(元) 00
最低价(元) 00
收盘价(元) 00
成交量(股) 00
成交金额(元) 00
涨跌(元) 00
涨跌幅(%) 00
均价(元) 00
换手率(%) 00
A股流通市值(元)00
总市值(元) 00
A股流通股本(股)00
市盈率 00
54(查看缺失值):查看日期列含有缺失值的行
df %>% 
  filter(is.na(日期))

which(is.na(df$日期))   # 列缺失的行号
A tibble: 0 × 18
代码简称日期前收盘价(元)开盘价(元)最高价(元)最低价(元)收盘价(元)成交量(股)成交金额(元)涨跌(元)涨跌幅(%)均价(元)换手率(%)A股流通市值(元)总市值(元)A股流通股本(股)市盈率
<chr><chr><chr><dbl><dbl><dbl><dbl><dbl><chr><chr><dbl><dbl><chr><chr><dbl><dbl><dbl><dbl>
55(查看缺失值):查看每列缺失值在哪些行
Method 1:
naIdx = df %>%
  where_na()         # 返回NA的行列索引, 需要naniar包
 
split(naIdx[,1], naIdx[,2])
Method 2:
library(glue)
 
for (i in names(df)){
  if(sum(is.na(df[,i])) != 0){
    res1 <- which(is.na(df[,i]))
    res2 <- paste(res1,collapse = ',')
    print(glue('列名:"{i}", 第[{res2}]行有缺失值'))
  }
}
Attaching package: 'glue'


The following object is masked from ‘package:dplyr’:

    collapse


56(缺失值处理):删除所有存在缺失值的行

Note: to drop rows with missing values only in certain columns, pass the column names to drop_na(); see the example below.

#df %>% drop_na()
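A hedged example of the column-specific form mentioned in the note above (assuming these column names):

# df %>% drop_na(日期, `收盘价(元)`)   # drop only rows where either of these columns is NA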
57(数据可视化):绘制收盘价的折线图
library(ggplot2)

png('./data_R/out/57_折线图.png')
df %>% ggplot(aes(日期, `收盘价(元)`,group = 1)) + geom_line()     #必须加上group = 1
dev.off()

png: 2

58(数据可视化):同时绘制开盘价与收盘价
png('./data_R/out/58_折线图.png')
df %>%
  select(日期, `开盘价(元)`, `收盘价(元)`) %>% 
  pivot_longer(-日期, 
               names_to = "type",
               values_to = "price") %>%
  ggplot(aes(日期, price, color = type, group = 1)) +    #必须加上group = 1,不然图片出不来
  geom_line() 

dev.off()

png: 2

59(数据可视化):绘制涨跌幅的直方图
png('./data_R/out/59_涨跌幅直方图.png')

df %>% ggplot(aes(`涨跌幅(%)`)) + geom_histogram()
# 可以指定bins
dev.off()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

png: 2

60(数据可视化):让直方图更细致
png('./data_R/out/60_涨跌幅直方图.png')

df %>% 
  ggplot(aes(`涨跌幅(%)`)) +
  geom_histogram(bins = 40)

dev.off()

png: 2

P61-70

61(数据创建):用df的列名创建数据框
as_tibble(names(df))
A tibble: 18 × 1
value
<chr>
代码
简称
日期
前收盘价(元)
开盘价(元)
最高价(元)
最低价(元)
收盘价(元)
成交量(股)
成交金额(元)
涨跌(元)
涨跌幅(%)
均价(元)
换手率(%)
A股流通市值(元)
总市值(元)
A股流通股本(股)
市盈率
62(异常值处理):输出所有换手率不是数字的行
#换手率这一列属性为chr,需要先强转数值型
#如果转换失败会变成NA,判断即可
df[is.na(as.numeric(df$`换手率(%)`)),]
Warning message in `[.tbl_df`(df, is.na(as.numeric(df$`换手率(%)`)), ):
"强制改变过程中产生了NA"
A tibble: 18 × 18
代码简称日期前收盘价(元)开盘价(元)最高价(元)最低价(元)收盘价(元)成交量(股)成交金额(元)涨跌(元)涨跌幅(%)均价(元)换手率(%)A股流通市值(元)总市值(元)A股流通股本(股)市盈率
<chr><chr><chr><dbl><dbl><dbl><dbl><dbl><chr><chr><dbl><dbl><chr><chr><dbl><dbl><dbl><dbl>
600000.SH浦发银行2016_2_1616.294616.294616.294616.294616.2946NA--00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_1716.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_1816.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_1916.294616.294616.294616.294616.2946--NA00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2216.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2316.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2416.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2516.294616.294616.294616.294616.2946----00NA--3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2616.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2916.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_1 16.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_2 16.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_3 16.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_4 16.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_7 16.294616.294616.294616.294616.2946----00--NA3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_8 16.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_9 16.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_1016.294616.294616.294616.294616.2946NA--00----3.44e+113.44e+11186534714156.801
head(
    df %>% 
          mutate(`换手率(%)` = parse_number(`换手率(%)`)) %>% 
          filter(is.na(`换手率(%)`))
    )
Warning message:
"17 parsing failures.
row col expected actual
 27  -- a number     --
 28  -- a number     --
 29  -- a number     --
 30  -- a number     --
 31  -- a number     --
... ... ........ ......
See problems(...) for more details.
"
A tibble: 6 × 18
代码简称日期前收盘价(元)开盘价(元)最高价(元)最低价(元)收盘价(元)成交量(股)成交金额(元)涨跌(元)涨跌幅(%)均价(元)换手率(%)A股流通市值(元)总市值(元)A股流通股本(股)市盈率
<chr><chr><chr><dbl><dbl><dbl><dbl><dbl><chr><chr><dbl><dbl><chr><dbl><dbl><dbl><dbl><dbl>
600000.SH浦发银行2016_2_1616.294616.294616.294616.294616.2946NA--00--NA3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_1716.294616.294616.294616.294616.2946----00--NA3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_1816.294616.294616.294616.294616.2946----00--NA3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_1916.294616.294616.294616.294616.2946--NA00--NA3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2216.294616.294616.294616.294616.2946----00--NA3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2316.294616.294616.294616.294616.2946----00--NA3.44e+113.44e+11186534714156.801
63(异常值处理):输出所有换手率为–的行
df %>% 
  filter(`换手率(%)` == "--")
A tibble: 17 × 18
代码简称日期前收盘价(元)开盘价(元)最高价(元)最低价(元)收盘价(元)成交量(股)成交金额(元)涨跌(元)涨跌幅(%)均价(元)换手率(%)A股流通市值(元)总市值(元)A股流通股本(股)市盈率
<chr><chr><chr><dbl><dbl><dbl><dbl><dbl><chr><chr><dbl><dbl><chr><chr><dbl><dbl><dbl><dbl>
600000.SH浦发银行2016_2_1616.294616.294616.294616.294616.2946NA--00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_1716.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_1816.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_1916.294616.294616.294616.294616.2946--NA00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2216.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2316.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2416.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2516.294616.294616.294616.294616.2946----00NA--3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2616.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_2_2916.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_1 16.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_2 16.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_3 16.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_4 16.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_8 16.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_9 16.294616.294616.294616.294616.2946----00----3.44e+113.44e+11186534714156.801
600000.SH浦发银行2016_3_1016.294616.294616.294616.294616.2946NA--00----3.44e+113.44e+11186534714156.801
64(数据操作):重置df的行号
rownames(df) = NULL    # R中无行号就是数字索引    
# 如果是tibble则索引始终是按顺序
65(异常值处理):删除所有换手率为非数字的行
df <- df %>% 
  mutate(`换手率(%)` = parse_number(`换手率(%)`)) %>% 
  filter(!is.na(`换手率(%)`))
Warning message:
"17 parsing failures.
row col expected actual
 27  -- a number     --
 28  -- a number     --
 29  -- a number     --
 30  -- a number     --
 31  -- a number     --
... ... ........ ......
See problems(...) for more details.
"
#df[!is.na(as.numeric(df$`换手率(%)`)),]
# 或者根据前几题的经验,非数字就是'--'
df <- df %>%
  filter(`换手率(%)` != '--')

To simplify later steps, one could batch-convert the remaining columns to numeric and turn the data into a tsibble object (left commented out below).

# library(tsibble)
 
# df <- df %>% 
#   mutate_at(vars(4:18), as.numeric) %>% 
#   mutate(日期 = lubridate::as_date(日期)) %>%
#   as_tsibble(index = 日期, key = c(代码, 简称))

66(数据可视化):绘制换手率的密度曲线
png('./data_R/out/66_换手率的密度曲线.png')

df$`换手率(%)` <- as.double(df$`换手率(%)`)
ggplot(df) +
  geom_density(aes(`换手率(%)`))

dev.off()

png: 2

67(数据计算):计算前一天与后一天收盘价的差值

Both summarise() and mutate() can transform or aggregate a single column of a data frame (rather than the whole data frame). The main difference lies in what they return: summarise() returns a data frame containing only the transformed/aggregated result, while mutate() returns the original data with the new column(s) appended.

delta <- df %>%
  summarise(delta = `收盘价(元)` - lag(`收盘价(元)`))
head(delta)
A tibble: 6 × 1
delta
<dbl>
NA
0.1413
0.1237
-0.5211
-0.0177
-0.3886
delta <- df %>% 
  mutate(delta = `收盘价(元)` - lag(`收盘价(元)`)) %>% 
  select(日期, `收盘价(元)`, delta)
head(delta)
A tibble: 6 × 3
日期收盘价(元)delta
<chr><dbl><dbl>
2016_1_4 15.7205 NA
2016_1_5 15.8618 0.1413
2016_1_6 15.9855 0.1237
2016_1_7 15.4644-0.5211
2016_1_8 15.4467-0.0177
2016_1_1115.0581-0.3886
68(数据计算):计算前一天与后一天收盘价的变化率

The lead()/lag() functions
These let you pull, in a single query, the value of the same column N rows earlier (lag) or N rows later (lead) as an independent column.

pct_change <- df %>% summarise(pct_change = (`收盘价(元)` - lag(`收盘价(元)`))/lag(`收盘价(元)`))
head(pct_change)
A tibble: 6 × 1
pct_change
<dbl>
NA
0.008988264
0.007798610
-0.032598292
-0.001144564
-0.025157477
pct_change <- df %>% 
  mutate(change = (`收盘价(元)` - lag(`收盘价(元)`)) / `收盘价(元)`) %>% 
  select(日期, `收盘价(元)`, change)
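  # note: unlike pct_change above, this variant divides by the current close rather
  # than lag(`收盘价(元)`), so the two outputs differ slightly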
head(pct_change)
A tibble: 6 × 3
日期收盘价(元)change
<chr><dbl><dbl>
2016_1_4 15.7205 NA
2016_1_5 15.8618 0.008908195
2016_1_6 15.9855 0.007738263
2016_1_7 15.4644-0.033696749
2016_1_8 15.4467-0.001145876
2016_1_1115.0581-0.025806709
69(数据操作):设置日期为行索引
head(
    df %>% column_to_rownames("日期")
)

A data.frame: 6 × 17
代码简称前收盘价(元)开盘价(元)最高价(元)最低价(元)收盘价(元)成交量(股)成交金额(元)涨跌(元)涨跌幅(%)均价(元)换手率(%)A股流通市值(元)总市值(元)A股流通股本(股)市盈率
<chr><chr><dbl><dbl><dbl><dbl><dbl><chr><chr><dbl><dbl><chr><dbl><dbl><dbl><dbl><dbl>
2016_1_4600000.SH浦发银行16.135616.144416.144415.499715.720542240610754425783 -0.4151-2.572517.8601999999999990.22643.32e+113.32e+11186534714156.5614
2016_1_5600000.SH浦发银行15.720515.464415.950115.367215.8618580547931034181474 0.1413 0.898917.8139 0.31123.35e+113.35e+11186534714156.6204
2016_1_6600000.SH浦发银行15.861815.808816.020815.623415.985546772653838667398 0.1236 0.779517.9307000000000020.25073.38e+113.38e+11186534714156.6720
2016_1_7600000.SH浦发银行15.985515.720515.808815.367215.464411350479199502702 -0.5211-3.259717.5765999999999990.06083.27e+113.27e+11186534714156.4545
2016_1_8600000.SH浦发银行15.464415.667515.791214.934515.4467719182961262105060-0.0177-0.114217.5491999999999990.38553.26e+113.26e+11186534714156.4471
2016_1_11600000.SH浦发银行15.446715.199415.411414.978615.0581901771351550155933-0.3886-2.515717.1901000000000010.48343.18e+113.18e+11186534714156.2849
70(数据计算):对收盘价做步长为5的滑动平均
library(slider)
head(
        df %>%
          mutate(avg_5 = slide_dbl(`收盘价(元)`, mean, na.rm = TRUE, 
                                   .before = 2, .after = 2)) %>% 
          select(日期, `收盘价(元)`, avg_5)
)

A tibble: 6 × 3
日期收盘价(元)avg_5
<chr><dbl><dbl>
2016_1_4 15.720515.85593
2016_1_5 15.861815.75805
2016_1_6 15.985515.69578
2016_1_7 15.464415.56330
2016_1_8 15.446715.47322
2016_1_1115.058115.34780

A sliding window of 5 observations is used and the closing-price mean is taken over each (centred) window; a trailing-window alternative is sketched after the commented RcppRoll lines below.

# library(RcppRoll)
# df %>% transmute(avg_5 = roll_mean(`收盘价(元)`,n = 5,align="right",fill = NA))
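For a trailing (right-aligned) 5-day window like the commented RcppRoll call above, slider can simply shift the window; a hedged sketch:

# df %>%
#   mutate(avg_5_trailing = slide_dbl(`收盘价(元)`, mean, .before = 4, .complete = TRUE)) %>%
#   select(日期, `收盘价(元)`, avg_5_trailing) %>%
#   head()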

P71-80

71(数据计算):对收盘价做步长为5的滑动求和
head(
        df %>%
          mutate(sum_5 = slide_dbl(`收盘价(元)`, sum, na.rm = TRUE, 
                                   .before = 2, .after = 2)) %>% 
          select(日期, `收盘价(元)`, sum_5)
)
A tibble: 6 × 3
日期收盘价(元)sum_5
<chr><dbl><dbl>
2016_1_4 15.720547.5678
2016_1_5 15.861863.0322
2016_1_6 15.985578.4789
2016_1_7 15.464477.8165
2016_1_8 15.446777.3661
2016_1_1115.058176.7390
# df %>%
#   transmute(sum_5 = roll_sum(`收盘价(元)`,n = 5,align="right",fill = NA))
72(数据可视化):将收盘价及其5日均线、20日均线绘制在同一个图上
#df$`收盘价(元)` <- as.numeric(df$`收盘价(元)`)
head(df)
A tibble: 6 × 18
代码简称日期前收盘价(元)开盘价(元)最高价(元)最低价(元)收盘价(元)成交量(股)成交金额(元)涨跌(元)涨跌幅(%)均价(元)换手率(%)A股流通市值(元)总市值(元)A股流通股本(股)市盈率
<chr><chr><chr><dbl><dbl><dbl><dbl><dbl><chr><chr><dbl><dbl><chr><dbl><dbl><dbl><dbl><dbl>
600000.SH浦发银行2016_1_4 16.135616.144416.144415.499715.720542240610754425783 -0.4151-2.572517.8601999999999990.22643.32e+113.32e+11186534714156.5614
600000.SH浦发银行2016_1_5 15.720515.464415.950115.367215.8618580547931034181474 0.1413 0.898917.8139 0.31123.35e+113.35e+11186534714156.6204
600000.SH浦发银行2016_1_6 15.861815.808816.020815.623415.985546772653838667398 0.1236 0.779517.9307000000000020.25073.38e+113.38e+11186534714156.6720
600000.SH浦发银行2016_1_7 15.985515.720515.808815.367215.464411350479199502702 -0.5211-3.259717.5765999999999990.06083.27e+113.27e+11186534714156.4545
600000.SH浦发银行2016_1_8 15.464415.667515.791214.934515.4467719182961262105060-0.0177-0.114217.5491999999999990.38553.26e+113.26e+11186534714156.4471
600000.SH浦发银行2016_1_1115.446715.199415.411414.978615.0581901771351550155933-0.3886-2.515717.1901000000000010.48343.18e+113.18e+11186534714156.2849
png('./data_R/out/72_收盘价及其均线1.png')

df %>%
  mutate(avg_5 = slide_dbl(`收盘价(元)`, mean, na.rm = TRUE, .before = 2, .after = 2),
         avg_20 = slide_dbl(`收盘价(元)`, mean, na.rm = TRUE, .before = 10, .after = 9)) %>% 
  pivot_longer(c(`收盘价(元)`, avg_5, avg_20),
                  names_to = "type",
                  values_to = "price") %>% 
  ggplot(aes(日期, price, color = type,group = 1)) +
    geom_line()
dev.off()

png: 2

# png('./data_R/out/72_收盘价及其均线2.png')
# df %>%
#   mutate(avg_5 = roll_mean(`收盘价(元)`,n = 5,align="right",fill = NA),
#          avg_20 = roll_mean(`收盘价(元)`,n = 20,align="right",fill = NA)) %>%
#   ggplot() +
#   geom_line(aes(日期,`收盘价(元)`,group = 1),color = 'steelblue',size = 1.2) +
#   geom_line(aes(日期,avg_5,group = 1),color = 'orange',size = 1.2) +
#   geom_line(aes(日期,avg_20,group = 1),color = 'green',size = 1.2)

# dev.off()
73(数据重采样):按周为采样规则,计算一周收盘价最大值
# weekmax = df %>% 
#   index_by(weeks = ~ yearweek(.)) %>%    #周度汇总
#   summarise(max_week = max(`收盘价(元)`, na.rm = TRUE))
# weekmax
# library(plyr)
 
# res <- dlply(df,.(cut(日期,"7")),"[")
# res_max <- sapply(res,function(n)max(n$`收盘价(元)`),simplify=TRUE)
# as.data.frame(res_max)
Error in cut.default(日期, "7"): 'x' must be numeric
(The commented plyr approach above fails as-is because 日期 is stored as character, not numeric/Date; a working sketch follows below.)
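A minimal working sketch (not from the original; it assumes 日期 holds strings like "2016_1_4" and that lubridate is attached):

weekmax <- df %>%
  mutate(date = as.Date(日期, format = "%Y_%m_%d"),             # parse the underscore-separated dates
         week = lubridate::floor_date(date, unit = "week")) %>% # map each date to the start of its week
  group_by(week) %>%
  summarise(max_week = max(`收盘价(元)`, na.rm = TRUE))          # weekly maximum of the closing price
head(weekmax)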
74(数据可视化):绘制重采样数据与原始数据
# png('./data_R/out/74_重采样数据与原始数据.png')
# ggplot() +
#   geom_line(data = weekmax, aes(weeks, max_week), color = "red") +
#   geom_line(data = df, aes(日期, `收盘价(元)`), color = "steelblue")  
# dev.off()
75(数据操作):将数据往后移动5天

This shifts all non-key columns at once; to shift a single variable, mutate(var = lag(var, 5)) is enough.

head(
bind_cols(df[,1:3], map_dfc(df[,-(1:3)], lag, n = 5)), 10
)

A tibble: 10 × 18
代码简称日期前收盘价(元)开盘价(元)最高价(元)最低价(元)收盘价(元)成交量(股)成交金额(元)涨跌(元)涨跌幅(%)均价(元)换手率(%)A股流通市值(元)总市值(元)A股流通股本(股)市盈率
<chr><chr><chr><dbl><dbl><dbl><dbl><dbl><chr><chr><dbl><dbl><chr><dbl><dbl><dbl><dbl><dbl>
600000.SH浦发银行2016_1_4 NA NA NA NA NANA NA NA NANA NA NA NA NA NA
600000.SH浦发银行2016_1_5 NA NA NA NA NANA NA NA NANA NA NA NA NA NA
600000.SH浦发银行2016_1_6 NA NA NA NA NANA NA NA NANA NA NA NA NA NA
600000.SH浦发银行2016_1_7 NA NA NA NA NANA NA NA NANA NA NA NA NA NA
600000.SH浦发银行2016_1_8 NA NA NA NA NANA NA NA NANA NA NA NA NA NA
600000.SH浦发银行2016_1_1116.135616.144416.144415.499715.720542240610754425783 -0.4151-2.572517.8601999999999990.22643.32e+113.32e+11186534714156.5614
600000.SH浦发银行2016_1_1215.720515.464415.950115.367215.8618580547931034181474 0.1413 0.898917.8139 0.31123.35e+113.35e+11186534714156.6204
600000.SH浦发银行2016_1_1315.861815.808816.020815.623415.985546772653838667398 0.1236 0.779517.9307000000000020.25073.38e+113.38e+11186534714156.6720
600000.SH浦发银行2016_1_1415.985515.720515.808815.367215.464411350479199502702 -0.5211-3.259717.5765999999999990.06083.27e+113.27e+11186534714156.4545
600000.SH浦发银行2016_1_1515.464415.667515.791214.934515.4467719182961262105060-0.0177-0.114217.5491999999999990.38553.26e+113.26e+11186534714156.4471
76(数据操作):将数据往前移动5天
head(
bind_cols(df[,1:3], map_dfc(df[,-(1:3)], lead, n = 5)), 5
)
A tibble: 5 × 18
代码简称日期前收盘价(元)开盘价(元)最高价(元)最低价(元)收盘价(元)成交量(股)成交金额(元)涨跌(元)涨跌幅(%)均价(元)换手率(%)A股流通市值(元)总市值(元)A股流通股本(股)市盈率
<chr><chr><chr><dbl><dbl><dbl><dbl><dbl><chr><chr><dbl><dbl><chr><dbl><dbl><dbl><dbl><dbl>
600000.SH浦发银行2016_1_415.446715.199415.411414.978615.0581901771351550155933-0.3886-2.515717.1901000000000010.48343.18e+113.18e+11186534714156.2849
600000.SH浦发银行2016_1_515.058115.164115.473215.084615.411455374454964061502 0.3533 2.346017.4099 0.29693.26e+113.26e+11186534714156.4324
600000.SH浦发银行2016_1_615.411415.517415.808815.323115.358447869312843717365 -0.0530-0.343817.6253999999999990.25663.24e+113.24e+11186534714156.4102
600000.SH浦发银行2016_1_715.358415.014015.888314.916815.826554838833966117848 0.4681 3.047717.6174 0.29403.34e+113.34e+11186534714156.6056
600000.SH浦发银行2016_1_815.826515.720516.029615.473215.526246723139836146426 -0.3003-1.897317.8958000000000010.25053.28e+113.28e+11186534714156.4803
77(数据操作):计算开盘价的累积平均
rlt = df %>% 
  mutate(累积平均 = cummean(`开盘价(元)`)) %>% 
  select(日期, `开盘价(元)`, 累积平均)
head(rlt)
A tibble: 6 × 3
日期开盘价(元)累积平均
<chr><dbl><dbl>
2016_1_4 16.144416.14440
2016_1_5 15.464415.80440
2016_1_6 15.808815.80587
2016_1_7 15.720515.78453
2016_1_8 15.667515.76112
2016_1_1115.199415.66750
78(数据计算):绘制开盘价的累积平均与原始数据的折线图

height, width and res are the plot height, width and resolution, respectively; see the example after this note.
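A hedged one-liner showing all three arguments (hypothetical file name; res is optional):

# png("example.png", width = 700, height = 480, res = 120)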

png('./data_R/out/78_开盘价的累积平均与原始数据的折线图1.png', height=480,width=700)
rlt %>% 
  pivot_longer(-日期, names_to = "type", values_to = "price") %>% 
  ggplot(aes(日期, price, color = type, group = 1)) +
    geom_line()

dev.off()

png: 2

png('./data_R/out/78_开盘价的累积平均与原始数据的折线图2.png', height=480,width=700)
res <- df %>%
  transmute(cummean = cumsum(`开盘价(元)`)/1:dim(df)[1])

df %>%
  cbind(res) %>%
  dplyr::rename(Opening_Price = `开盘价(元)`,
         Expanding_Open_Mean = cummean) %>%
  select(日期,Opening_Price,Expanding_Open_Mean) %>%
  pivot_longer(c(Opening_Price,Expanding_Open_Mean),
               names_to = 'type',
               values_to ='price') %>%
  ggplot(aes(日期,price,color = type, group=1)) +
  geom_line(size=1.2) +
  scale_color_manual(values=c('orange','steelblue')) +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    legend.position = c(0.9, 0.9)
  )

dev.off()

png: 2

79(数据计算):计算布林指标
boll = df %>%
  mutate(avg_20 = slide_dbl(`收盘价(元)`, mean, na.rm = TRUE, .before = 10, .after = 9), 
         sd_20 = slide_dbl(`收盘价(元)`, sd, na.rm = TRUE, .before = 10, .after = 9),
         up = avg_20 + 2 * sd_20,
         down = avg_20 - 2 * sd_20) %>% 
  select(日期, `收盘价(元)`, avg_20, up, down)

 boll %>% sample_n(10)
A tibble: 10 × 5
日期收盘价(元)avg_20updown
<chr><dbl><dbl><dbl><dbl>
2017_2_2316.690016.6515017.0060416.29696
2016_5_1615.270115.5447616.0134015.07612
2017_2_1716.640016.7040016.9452616.46274
2016_5_2615.632215.6432416.2239215.06256
2016_8_2516.340016.4415016.7912516.09175
2017_4_1415.510015.5650016.4400714.68993
2017_4_1115.890015.7000016.5202714.87973
2016_5_3015.853015.6838716.2132915.15445
2017_1_2616.740016.6115016.9776016.24540
2016_6_1 16.065015.7280316.1683815.28767
80(数据可视化):绘制布林曲线
png('./data_R/out/80_布林曲线.png', height=480,width=2000)
boll %>% 
  pivot_longer(-日期, names_to = "type", values_to = "price") %>% 
  ggplot(aes(日期, price, color = type,group=1)) + 
    geom_line()
dev.off()

png: 2

P81-90

81(加载查看包):加载并查看tidyverse包版本
packageVersion("tidyverse")
[1] '1.3.1'
packageVersion("dplyr")
[1] '1.0.8'
82(生成随机数):生成20个0~100的随机数,创建数据框

sample.int(n, k) draws k random integers from 1 to n (without replacement by default).

set.seed(123)                # 保证结果出现
df1 = tibble(nums = sample.int(100, 20))
df1
A tibble: 20 × 1
nums
<int>
31
79
51
14
67
42
50
43
97
25
90
69
57
9
72
26
7
95
87
36
83(生成等差数):生成20个0~100固定步长的数,创建数据框

seq() generates an evenly spaced sequence:
seq(from, to, by)
where by is the step size.

df2 = tibble(nums = seq(0, 99, by = 5))
df2
A tibble: 20 × 1
nums
<dbl>
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
84(生成指定分布随机数):生成20个标准正态分布的随机数,创建数据框

rnorm(n, mean = 0, sd = 1) in R: n is the number of random values to generate, mean is the mean and sd is the standard deviation.

set.seed(111)    #set.seed()函数是为了保证你随机生成的随机数前后一致
df3 = tibble(nums = rnorm(20, 0, 1))
df3
A tibble: 20 × 1
nums
<dbl>
0.23522071
-0.33073587
-0.31162382
-2.30234566
-0.17087604
0.14027823
-1.49742666
-1.01018842
-0.94847560
-0.49396222
-0.17367413
-0.40659878
1.84563626
0.39405411
0.79752850
-1.56666536
-0.08585101
-0.35913948
-1.19360897
0.36418674
85(合并数据):将df1, df2, df3按行合并为新数据框
#bind_rows(df1, df2, df3)
rbind(df1, df2, df3)
86(合并数据):将df1, df2, df3按列合并为新数据框
df = cbind(df1, df2, df3)
names(df) <- c(0,1,2)
head(df)
A data.frame: 6 × 3
012
<int><dbl><dbl>
131 0 0.2352207
279 5-0.3307359
35110-0.3116238
41415-2.3023457
56720-0.1708760
64225 0.1402782
87(查看数据):查看df所有数据的最小值、25%分位数、中位数、75%分位数、最大值
summary(unlist(df))
# unlist(df) %>% summary()
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-2.3024  0.1052 25.5000 33.1654 61.2500 97.0000 
88(修改列名):修改列名为col1, col2, col3

To rename only individual columns, use rename(newname = oldname); a dplyr sketch follows after the output below.

names(df) <- c('col1','col2','col3')
head(df)
A data.frame: 6 × 3
col1col2col3
<int><dbl><dbl>
131 0 0.2352207
279 5-0.3307359
35110-0.3116238
41415-2.3023457
56720-0.1708760
64225 0.1402782
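A hedged dplyr alternative to the names(df) <- ... assignment above (assuming the columns were still named 0, 1, 2 from #86):

# df <- df %>% rename(col1 = `0`, col2 = `1`, col3 = `2`)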
89(数据操作):提取在第1列中而不在第2列中的数
setdiff(df$col1, df$col2)
  1. 31
  2. 79
  3. 51
  4. 14
  5. 67
  6. 42
  7. 43
  8. 97
  9. 69
  10. 57
  11. 9
  12. 72
  13. 26
  14. 7
  15. 87
  16. 36
90(数据操作):提取在第1列和第2列出现频率最高的三个数字
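# Note (assumption): count() on a bare vector here matches plyr::count(), which returns
# x/freq columns; dplyr::count() expects a data frame, so plyr is presumably attached.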
count(unlist(c(df$col1,df$col2))) %>%
  arrange(desc(freq)) %>%
  filter(row_number() <= 3)
A data.frame: 3 × 2
xfreq
<dbl><int>
252
502
902

P91-100

91(数据操作):提取第1列可以整除5的数的位置
which(df$col1 %% 5 == 0)
  1. 7
  2. 10
  3. 11
  4. 18
92(数据计算):计算第1列的1阶差分
df %>% mutate(diff1 = col1 - lag(col1))   #若只是要数值,用diff(df$col1)即可
A data.frame: 20 × 4
col1col2col3diff1
<int><dbl><dbl><int>
31 0 0.23522071 NA
79 5-0.33073587 48
5110-0.31162382-28
1415-2.30234566-37
6720-0.17087604 53
4225 0.14027823-25
5030-1.49742666 8
4335-1.01018842 -7
9740-0.94847560 54
2545-0.49396222-72
9050-0.17367413 65
6955-0.40659878-21
5760 1.84563626-12
965 0.39405411-48
7270 0.79752850 63
2675-1.56666536-46
780-0.08585101-19
9585-0.35913948 88
8790-1.19360897 -8
3695 0.36418674-51
93(数据操作):将col1, col2, col3三列顺序颠倒
head( 
    df %>% select(rev(names(df)))
)
A data.frame: 6 × 3
col3col2col1
<dbl><dbl><int>
1 0.2352207 031
2-0.3307359 579
3-0.31162381051
4-2.30234571514
5-0.17087602067
6 0.14027822542
head(
df %>% select(col3,col2,everything())
)
A data.frame: 6 × 3
col3col2col1
<dbl><dbl><int>
1 0.2352207 031
2-0.3307359 579
3-0.31162381051
4-2.30234571514
5-0.17087602067
6 0.14027822542
94(数据操作):提取第一列位置在1,10,15的数
df[c(1,10,15),1]
  1. 31
  2. 25
  3. 72
95(数据操作):查找第一列的局部最大值位置
rlt = df %>% 
  mutate(diff = sign(col1 - lag(col1)) + sign(col1 - lead(col1))) 
 
which(rlt$diff == 2)
  1. 2
  2. 5
  3. 7
  4. 9
  5. 11
  6. 15
  7. 18
96(数据计算):按行计算df每一行的均值

rowMeans() 函数

rowMeans(df)    # 或者 apply(df, 1, mean)
  1. 10.4117402372046
  2. 27.8897547094578
  3. 20.2294587253404
  4. 8.89921811385688
  5. 28.943041318462
  6. 22.3800927417011
  7. 26.167524448145
  8. 25.6632705269829
  9. 45.3505081316825
  10. 23.1686792609216
  11. 46.6087752906765
  12. 41.1978004066531
  13. 39.6152120879357
  14. 24.7980180366582
  15. 47.5991761670872
  16. 33.1444448799381
  17. 28.9713829970591
  18. 59.8802868395874
  19. 58.6021303444788
  20. 43.7880622456741
97(数据计算):对第二列计算步长为3的移动平均值
library(RcppRoll)
 
df %>%
  summarise(avg_3 = roll_mean(col2, n=3))
A data.frame: 18 × 1
avg_3
<dbl>
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
98(数据计算):按第三列值的大小升序排列
df %>% arrange(col3)
A data.frame: 20 × 3
col1col2col3
<int><dbl><dbl>
1415-2.30234566
2675-1.56666536
5030-1.49742666
8790-1.19360897
4335-1.01018842
9740-0.94847560
2545-0.49396222
6955-0.40659878
9585-0.35913948
79 5-0.33073587
5110-0.31162382
9050-0.17367413
6720-0.17087604
780-0.08585101
4225 0.14027823
31 0 0.23522071
3695 0.36418674
965 0.39405411
7270 0.79752850
5760 1.84563626
99(数据操作):按第一列大于50的数修改为"高"
df[df$col1 > 50,1] <- '高'
df
A data.frame: 20 × 3
col1col2col3
<chr><dbl><dbl>
31 0 0.23522071
5-0.33073587
10-0.31162382
1415-2.30234566
20-0.17087604
4225 0.14027823
5030-1.49742666
4335-1.01018842
40-0.94847560
2545-0.49396222
50-0.17367413
55-0.40659878
60 1.84563626
9 65 0.39405411
70 0.79752850
2675-1.56666536
7 80-0.08585101
85-0.35913948
90-1.19360897
3695 0.36418674
100(数据计算):计算第一列与第二列的欧氏距离

In R, dist() and cluster::daisy() are commonly used to compute distances between the row vectors of a matrix x.

dist(t(df[,1:2]))
# dist(rbind(df$col1,df$col2))
Warning message in dist(t(df[, 1:2])):
"强制改变过程中产生了NA"



         col1
col2 181.4497
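The warning appears because #99 replaced the large values of col1 with the string "高", so dist() coerces the column and produces NAs (which dist() then compensates for by rescaling). A hedged sketch on the purely numeric pairs, simply dropping the NAs:

x <- as.numeric(df$col1)            # the "高" entries become NA here
y <- df$col2
sqrt(sum((x - y)^2, na.rm = TRUE))  # Euclidean distance over the remaining numeric pairs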

P101-110

101(数据读取):从csv文件中读取指定数据:读取前10行, positionName和salary列

Note 1: this file is GBK-encoded. To avoid garbled Chinese, read GBK-encoded csv/txt with read.csv() and UTF-8 csv/txt with readr::read_csv(); to read a GBK file with read_csv() you must set the encoding (see question 110, and the readr sketch after the output below).

Note 2: base-R reading cannot select columns at read time, so read first and then select the columns.

read.csv("./data_R/数据1_101-120涉及.csv", nrows = 10) %>% 
  select(positionName, salary)
A data.frame: 10 × 2
positionNamesalary
<chr><int>
数据分析 37500
数据建模 15000
数据分析 3500
数据分析 45000
数据分析 30000
数据分析 50000
数据分析 30000
数据建模工程师35000
数据分析专家 60000
数据分析师 40000
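A hedged readr alternative for the same GBK-encoded file (same path as above):

# library(readr)
# read_csv("./data_R/数据1_101-120涉及.csv",
#          locale = locale(encoding = "GBK"),   # declare the file encoding
#          n_max = 10) %>%                      # read only the first 10 rows
#   select(positionName, salary)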
102(数据读取):从csv文件中读取数据,将薪资大于10000的改为"高"
df = read.csv("./data_R/数据2_101-120涉及.csv") %>% 
  mutate(薪资水平 = if_else(薪资水平 > 10000, "高", "低"))
head(df)

# library(readr)
 
# df2 <- read.csv('./data_R/数据2_101-120涉及.csv') %>%
#   mutate('学历要求',
#          '薪资水平' = ifelse(
#            薪资水平 > 10000,'高','低'))
A data.frame: 6 × 2
学历要求薪资水平
<chr><chr>
1本科
2硕士
3本科
4本科
5不限
6硕士
103(数据操作):从df中对薪资水平每隔20行进行抽样
df[seq(1,dim(df)[1],20),]
df %>% 
  slice(seq(1, n(), by = 20))  
# 或者用df[seq(1, nrow(df), 20),]
104(数据操作):取消使用科学记数法
set.seed(123)
df = tibble(data = runif(10) ^ 10) %>% 
  round(3)
df
A tibble: 10 × 1
data
<dbl>
0.000
0.093
0.000
0.288
0.541
0.000
0.002
0.320
0.003
0.000
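The solution above only rounds the values; to actually suppress scientific notation when printing, base R options can be used (a hedged aside, not part of the original answer):

# options(scipen = 999)                      # penalise scientific notation globally
# format(runif(1) ^ 10, scientific = FALSE)  # or format a single value explicitly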
105(数据操作):将上一题的数据转换为百分数

mutate()函数增加新列并保留已存在的列

scales::percent()将数字类型转换为百分数

df %>% 
  mutate(data = scales::percent(data, 0.01))
A tibble: 10 × 1
data
<chr>
0.00%
9.30%
0.00%
28.80%
54.10%
0.00%
0.20%
32.00%
0.30%
0.00%
106(数据操作):查找上一题数据中第3大值的行号

sort(x) sorts the vector x and returns the sorted values.
rank() returns each element's rank, i.e. its "place" within the vector.
order() returns the indices of the original vector in sorted order (ascending by default), as illustrated right below.
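A tiny illustration (not from the original):

v <- c(30, 10, 20)
sort(v)    # 10 20 30  -- the sorted values
rank(v)    # 3 1 2     -- the rank of each element within v
order(v)   # 2 3 1     -- indices that sort v, so v[order(v)] equals sort(v)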

order(df$data, decreasing = TRUE)[3]

4

107(数据操作):反转df的行
df %>% 
  slice(rev(1:n()))
# 或者 df[rev(1:nrow(df)),]
A tibble: 10 × 1
data
<dbl>
0.000
0.003
0.320
0.002
0.000
0.541
0.288
0.000
0.093
0.000
108(数据连接:全连接):根据多列匹配合并数据,保留df1和df2的观测
df1 <- tibble(
  key1 = c("K0","K0","K1","K2"),
  key2 = c("K0","K1","K0","K1"),
  A = str_c('A', 0:3),
  B = str_c('B', 0:3)
)
 
df2 <- tibble(
  key1 = c("K0","K1","K1","K2"),
  key2 = str_c("K", rep(0,4)),
  C = str_c('C', 0:3),
  D = str_c('D', 0:3)
)
df1
A tibble: 4 × 4
key1key2AB
<chr><chr><chr><chr>
K0K0A0B0
K0K1A1B1
K1K0A2B2
K2K1A3B3
df2
A tibble: 4 × 4
key1key2CD
<chr><chr><chr><chr>
K0K0C0D0
K1K0C1D1
K1K0C2D2
K2K0C3D3
df1 %>% 
  full_join(df2, by = c("key1", "key2"))
A tibble: 6 × 6
key1key2ABCD
<chr><chr><chr><chr><chr><chr>
K0K0A0B0C0D0
K0K1A1B1NANA
K1K0A2B2C1D1
K1K0A2B2C2D2
K2K1A3B3NANA
K2K0NANAC3D3


109(数据连接:左连接):根据多列匹配合并数据,只保留df1的观测
df1 %>% 
  left_join(df2, by = c("key1", "key2"))
A tibble: 5 × 6
key1key2ABCD
<chr><chr><chr><chr><chr><chr>
K0K0A0B0C0D0
K0K1A1B1NANA
K1K0A2B2C1D1
K1K0A2B2C2D2
K2K1A3B3NANA

dplyr also provides right_join() (right join), inner_join() (inner join), and the filtering joins semi_join() (semi join) and anti_join() (anti join); a short sketch follows below.
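A hedged sketch of the two filtering joins on the same df1/df2:

df1 %>% semi_join(df2, by = c("key1", "key2"))  # rows of df1 that have a match in df2
df1 %>% anti_join(df2, by = c("key1", "key2"))  # rows of df1 with no match in df2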

110(数据处理):再次读取数据1并显示所有列

Both str() and glimpse() are convenient ways to inspect the variables inside a data frame.

df <- read.csv("./data_R/数据1_101-120涉及.csv") %>%
      glimpse()
Rows: 105
Columns: 53
$ positionId            <int> 6802721, 5204912, 6877668, 6496141, 6467417, 688~
$ positionName          <chr> "数据分析", "数据建模", "数据分析", "数据分析", ~
$ companyId             <int> 475770, 50735, 100125, 26564, 29211, 94826, 3487~
$ companyLogo           <chr> "i/image2/M01/B7/3E/CgoB5lwPfEaAdn8WAABWQ0Jgl5s3~
$ companySize           <chr> "50-150人", "150-500人", "2000人以上", "500-2000~
$ industryField         <chr> "移动互联网,电商", "电商", "移动互联网,企业服务"~
$ financeStage          <chr> "A轮", "B轮", "上市公司", "D轮及以上", "上市公司~
$ companyLabelList      <chr> "['绩效奖金', '带薪年假', '定期体检', '弹性工作'~
$ firstType             <chr> "产品|需求|项目类", "开发|测试|运维类", "产品|需~
$ secondType            <chr> "数据分析", "数据开发", "数据分析", "数据开发", ~
$ thirdType             <chr> "数据分析", "建模", "数据分析", "数据分析", "数~
$ skillLables           <chr> "['SQL', '数据库', '数据运营', 'BI']", "['算法',~
$ positionLables        <chr> "['电商', '社交', 'SQL', '数据库', '数据运营', '~
$ industryLables        <chr> "['电商', '社交', 'SQL', '数据库', '数据运营', '~
$ createTime            <chr> "2020/3/16 11:00", "2020/3/16 11:08", "2020/3/16~
$ formatCreateTime      <chr> "11:00发布", "11:08发布", "10:33发布", "10:10发~
$ district              <chr> "余杭区", "滨江区", "江干区", "江干区", "余杭区"~
$ businessZones         <chr> "['仓前']", "['西兴', '长河']", "['四季青', '钱~
$ salary                <int> 37500, 15000, 3500, 45000, 30000, 50000, 30000, ~
$ workYear              <chr> "1-3年", "3-5年", "1-3年", "3-5年", "3-5年", "1-~
$ jobNature             <chr> "全职", "全职", "全职", "全职", "全职", "全职", ~
$ education             <chr> "本科", "本科", "本科", "本科", "大专", "本科", ~
$ positionAdvantage     <chr> "五险一金、弹性工作、带薪年假、年度体检", "六险~
$ imState               <chr> "today", "disabled", "today", "threeDays", "disa~
$ lastLogin             <chr> "2020/3/16 11:00", "2020/3/16 11:08", "2020/3/16~
$ publisherId           <int> 12022406, 5491688, 5322583, 9814560, 6392394, 11~
$ approve               <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
$ subwayline            <chr> "", "", "4号线", "1号线", "", "", "", "2号线", "~
$ stationname           <chr> "", "", "江锦路", "文泽路", "", "", "", "丰潭路"~
$ linestaion            <chr> "", "", "4号线_城星路;4号线_市民中心;4号线_江锦~
$ latitude              <dbl> 30.27842, 30.18804, 30.24152, 30.29940, 30.28295~
$ longitude             <dbl> 120.0059, 120.2012, 120.2125, 120.3503, 120.0098~
$ hitags                <chr> "", "", "", "", "", "", "", "", "", "", "", "", ~
$ resumeProcessRate     <int> 50, 23, 11, 100, 20, 16, 100, 1, 83, 1, 83, 0, 1~
$ resumeProcessDay      <int> 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, ~
$ score                 <int> 233, 176, 80, 68, 66, 66, 65, 47, 24, 18, 17, 17~
$ newScore              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ matchScore            <dbl> 15.1018750, 32.5594140, 14.9723570, 12.8741530, ~
$ matchScoreExplain     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ query                 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ explain               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ isSchoolJob           <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ~
$ adWord                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ plus                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ pcShow                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ appShow               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ deliver               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ gradeDescription      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ promotionScoreExplain <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ isHotHire             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ count                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ aggregatePositionIds  <chr> "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]", ~
$ famousCompany         <lgl> FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, T~

P111-120

111(数据操作):查找secondType与thirdType值相等的行号
which(df$secondType == df$thirdType)
  1. 1
  2. 3
  3. 5
  4. 6
  5. 7
  6. 11
  7. 15
  8. 24
  9. 26
  10. 28
  11. 29
  12. 30
  13. 31
  14. 34
  15. 38
  16. 39
  17. 40
  18. 41
  19. 42
  20. 49
  21. 50
  22. 53
  23. 54
  24. 56
  25. 58
  26. 62
  27. 66
  28. 67
  29. 68
  30. 72
  31. 74
  32. 75
  33. 76
  34. 80
  35. 81
  36. 83
  37. 86
  38. 89
  39. 90
  40. 92
  41. 97
  42. 101
112(数据操作):查找薪资大于平均薪资的第三个数据
df %>% 
  filter(salary > mean(salary)) %>% 
  slice(3)
A data.frame: 1 × 53
positionIdpositionNamecompanyIdcompanyLogocompanySizeindustryFieldfinanceStagecompanyLabelListfirstTypesecondType...pluspcShowappShowdelivergradeDescriptionpromotionScoreExplainisHotHirecountaggregatePositionIdsfamousCompany
<int><chr><int><chr><chr><chr><chr><chr><chr><chr>...<lgl><int><int><int><lgl><lgl><int><int><chr><lgl>
6882347数据分析94826image2/M00/04/12/CgpzWlXwKIKAaJd9AAARGoomCu4858.jpg?cc=0.677638628985732850-150人移动互联网,社交B轮['股票期权', '扁平管理', '五险一金', '岗位晋升']产品|需求|项目类数据分析...NA000NANA00[]FALSE
113(数据操作):将上一题数据的salary列开根号
head(
df %>% 
  mutate(salary_sqrt = sqrt(salary)) %>% 
  select(salary, salary_sqrt)
)

A data.frame: 6 × 2
salarysalary_sqrt
<int><dbl>
137500193.6492
215000122.4745
3 3500 59.1608
445000212.1320
530000173.2051
650000223.6068
114(数据操作):将上一题数据的linestation列按_拆分
head(

df %>% 
  separate(linestaion, into = c("line", "station"), sep = "_", remove = FALSE) %>% 
  select(linestaion, line, station)
)
Warning message:
"Expected 2 pieces. Additional pieces discarded in 41 rows [3, 8, 10, 12, 13, 20, 21, 22, 23, 27, 29, 30, 31, 32, 34, 36, 37, 38, 41, 44, ...]."
Warning message:
"Expected 2 pieces. Missing pieces filled with `NA` in 60 rows [1, 2, 5, 6, 7, 9, 11, 14, 15, 16, 17, 18, 19, 24, 25, 26, 28, 33, 35, 39, ...]."
A data.frame: 6 × 3
linestaionlinestation
<chr><chr><chr>
1 NA
2 NA
34号线_城星路;4号线_市民中心;4号线_江锦路4号线城星路;4号线
41号线_文泽路 1号线文泽路
5 NA
6 NA
head(
df %>%
  mutate(split = str_split(linestaion,'_'))
)
A data.frame: 6 × 54
positionIdpositionNamecompanyIdcompanyLogocompanySizeindustryFieldfinanceStagecompanyLabelListfirstTypesecondType...pcShowappShowdelivergradeDescriptionpromotionScoreExplainisHotHirecountaggregatePositionIdsfamousCompanysplit
<int><chr><int><chr><chr><chr><chr><chr><chr><chr>...<int><int><int><lgl><lgl><int><int><chr><lgl><list>
16802721数据分析475770i/image2/M01/B7/3E/CgoB5lwPfEaAdn8WAABWQ0Jgl5s31.jpeg 50-150人 移动互联网,电商 A轮 ['绩效奖金', '带薪年假', '定期体检', '弹性工作'] 产品|需求|项目类数据分析...000NANA00[]FALSE
25204912数据建模 50735image1/M00/00/85/CgYXBlTUXeeAR0IjAABbroUk-dw977.png 150-500人 电商 B轮 ['年终奖金', '做五休二', '六险一金', '子女福利'] 开发|测试|运维类数据开发...000NANA00[]FALSE
36877668数据分析100125image2/M00/0C/57/CgqLKVYcOA2ADcFuAAAE8MukIKA745.png?cc=0.80892901937477292000人以上移动互联网,企业服务上市公司 ['节日礼物', '年底双薪', '股票期权', '带薪年假'] 产品|需求|项目类数据分析...000NANA00[]FALSE4号线 , 城星路;4号线 , 市民中心;4号线, 江锦路
46496141数据分析 26564i/image2/M01/F7/3F/CgoB5lyGAQGAZeI-AAAdOqXecnw673.jpg 500-2000人电商 D轮及以上['生日趴', '每月腐败基金', '每月补贴', '年度旅游']开发|测试|运维类数据开发...000NANA00[] TRUE1号线 , 文泽路
56467417数据分析 29211i/image2/M01/77/B8/CgoB5l1WDyGATNP5AAAlY3h88SY623.png 2000人以上物流丨运输 上市公司 ['技能培训', '免费班车', '专项奖金', '岗位晋升'] 产品|需求|项目类数据分析...000NANA00[] TRUE
66882347数据分析 94826image2/M00/04/12/CgpzWlXwKIKAaJd9AAARGoomCu4858.jpg?cc=0.677638628985732850-150人 移动互联网,社交 B轮 ['股票期权', '扁平管理', '五险一金', '岗位晋升'] 产品|需求|项目类数据分析...000NANA00[]FALSE
115(数据查看):查看上一题数据一共有多少列
ncol(df)

54

116(数据操作):提取industryField列以"数据"开头的行
head(
df[grep("^数据", df$industryField),]
)
A data.frame: 6 × 54
positionIdpositionNamecompanyIdcompanyLogocompanySizeindustryFieldfinanceStagecompanyLabelListfirstTypesecondType...pcShowappShowdelivergradeDescriptionpromotionScoreExplainisHotHirecountaggregatePositionIdsfamousCompanysplit
<int><chr><int><chr><chr><chr><chr><chr><chr><chr>...<int><int><int><lgl><lgl><int><int><chr><lgl><list>
96458372数据分析专家 34132i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png150-500人数据服务,广告营销A轮 ['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']产品|需求|项目类数据分析...000NANA00[]FALSE
116804629数据分析师 34132i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png150-500人数据服务,广告营销A轮 ['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']产品|需求|项目类数据分析...000NANA00[]FALSE
146804489资深数据分析师34132i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png150-500人数据服务,广告营销A轮 ['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']开发|测试|运维类数据开发...000NANA00[]FALSE
226267370数据分析专家 31544image1/M00/00/48/CgYXBlTUXOaADKooAABjQoD_n1w508.png 150-500人数据服务 不需要融资['专业红娘牵线', '节日礼物', '技能培训', '岗位晋升'] 开发|测试|运维类数据开发...000NANA00[]FALSE4号线 , 中医药大学;4号线, 联庄
336804489资深数据分析师34132i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png150-500人数据服务,广告营销A轮 ['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']开发|测试|运维类数据开发...000NANA00[]FALSE
386242470数据分析师 31544image1/M00/00/48/CgYXBlTUXOaADKooAABjQoD_n1w508.png 150-500人数据服务 不需要融资['专业红娘牵线', '节日礼物', '技能培训', '岗位晋升'] 产品|需求|项目类数据分析...000NANA00[]FALSE4号线 , 中医药大学;4号线, 联庄
head(
df %>% 
  filter(str_detect(industryField, "^数据"))
)
A data.frame: 6 × 54
positionIdpositionNamecompanyIdcompanyLogocompanySizeindustryFieldfinanceStagecompanyLabelListfirstTypesecondType...pcShowappShowdelivergradeDescriptionpromotionScoreExplainisHotHirecountaggregatePositionIdsfamousCompanysplit
<int><chr><int><chr><chr><chr><chr><chr><chr><chr>...<int><int><int><lgl><lgl><int><int><chr><lgl><list>
16458372数据分析专家 34132i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png150-500人数据服务,广告营销A轮 ['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']产品|需求|项目类数据分析...000NANA00[]FALSE
26804629数据分析师 34132i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png150-500人数据服务,广告营销A轮 ['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']产品|需求|项目类数据分析...000NANA00[]FALSE
36804489资深数据分析师34132i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png150-500人数据服务,广告营销A轮 ['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']开发|测试|运维类数据开发...000NANA00[]FALSE
46267370数据分析专家 31544image1/M00/00/48/CgYXBlTUXOaADKooAABjQoD_n1w508.png 150-500人数据服务 不需要融资['专业红娘牵线', '节日礼物', '技能培训', '岗位晋升'] 开发|测试|运维类数据开发...000NANA00[]FALSE4号线 , 中医药大学;4号线, 联庄
56804489资深数据分析师34132i/image2/M01/F8/DE/CgoB5lyHTJeAP7v9AAFXUt4zJo4515.png150-500人数据服务,广告营销A轮 ['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']开发|测试|运维类数据开发...000NANA00[]FALSE
66242470数据分析师 31544image1/M00/00/48/CgYXBlTUXOaADKooAABjQoD_n1w508.png 150-500人数据服务 不需要融资['专业红娘牵线', '节日礼物', '技能培训', '岗位晋升'] 产品|需求|项目类数据分析...000NANA00[]FALSE4号线 , 中医药大学;4号线, 联庄

117(数据分组汇总):以salary score和positionID做数据透视表
df %>% 
  group_by(positionId) %>% 
  summarise(salary_avg = mean(salary),
            score_avg = mean(score))
A data.frame: 1 × 2
salary_avgscore_avg
<dbl><dbl>
31723.8112.71429
df1 <- df %>%
  group_by(positionId) %>%
  dplyr::summarise(salary = mean(salary),
            score = mean(score)) %>%
  as.data.frame(.)
rownames(df) <- NULL
head(
tibble::column_to_rownames(df1,var='positionId')
)
A data.frame: 6 × 2
salaryscore
<dbl><dbl>
520305430000 4
520491215000176
526900237500 1
545369130000 4
551996237500 14
552062330000 6
118(数据分组汇总):同时对salary、score两列进行汇总计算
df1 %>% 
  summarise_at(vars(salary, score), list(~sum(.), ~mean(.), ~min(.)))
A data.frame: 1 × 6
salary_sumscore_sumsalary_meanscore_meansalary_minscore_min
<dbl><dbl><dbl><dbl><dbl><dbl>
30097501275.531681.5813.4263235000
119(数据分组汇总):同时对不同列进行不同的汇总计算:对salary求平均,对score求和
df1 %>% 
  summarise(salary_avg = mean(salary),
            score_sum = sum(score))
#注:若要分组再这样汇总,前面加上group_by(grpvar)即可,分组示例见下方。
A data.frame: 1 × 2
salary_avgscore_sum
<dbl><dbl>
31681.581275.5
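A hedged sketch of that grouped variant, using district from the full data frame df as the (assumed) grouping variable:

df %>%
  group_by(district) %>%
  summarise(salary_avg = mean(salary),   # mean salary per district
            score_sum = sum(score))      # total score per district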
120(数据分组汇总):计算并提取平均薪资最高的区
head(df)
A data.frame: 6 × 53
positionIdpositionNamecompanyIdcompanyLogocompanySizeindustryFieldfinanceStagecompanyLabelListfirstTypesecondType...pluspcShowappShowdelivergradeDescriptionpromotionScoreExplainisHotHirecountaggregatePositionIdsfamousCompany
<int><chr><int><chr><chr><chr><chr><chr><chr><chr>...<lgl><int><int><int><lgl><lgl><int><int><chr><lgl>
16802721数据分析475770i/image2/M01/B7/3E/CgoB5lwPfEaAdn8WAABWQ0Jgl5s31.jpeg 50-150人 移动互联网,电商 A轮 ['绩效奖金', '带薪年假', '定期体检', '弹性工作'] 产品|需求|项目类数据分析...NA000NANA00[]FALSE
25204912数据建模 50735image1/M00/00/85/CgYXBlTUXeeAR0IjAABbroUk-dw977.png 150-500人 电商 B轮 ['年终奖金', '做五休二', '六险一金', '子女福利'] 开发|测试|运维类数据开发...NA000NANA00[]FALSE
36877668数据分析100125image2/M00/0C/57/CgqLKVYcOA2ADcFuAAAE8MukIKA745.png?cc=0.80892901937477292000人以上移动互联网,企业服务上市公司 ['节日礼物', '年底双薪', '股票期权', '带薪年假'] 产品|需求|项目类数据分析...NA000NANA00[]FALSE
46496141数据分析 26564i/image2/M01/F7/3F/CgoB5lyGAQGAZeI-AAAdOqXecnw673.jpg 500-2000人电商 D轮及以上['生日趴', '每月腐败基金', '每月补贴', '年度旅游']开发|测试|运维类数据开发...NA000NANA00[] TRUE
56467417数据分析 29211i/image2/M01/77/B8/CgoB5l1WDyGATNP5AAAlY3h88SY623.png 2000人以上物流丨运输 上市公司 ['技能培训', '免费班车', '专项奖金', '岗位晋升'] 产品|需求|项目类数据分析...NA000NANA00[] TRUE
66882347数据分析 94826image2/M00/04/12/CgpzWlXwKIKAaJd9AAARGoomCu4858.jpg?cc=0.677638628985732850-150人 移动互联网,社交 B轮 ['股票期权', '扁平管理', '五险一金', '岗位晋升'] 产品|需求|项目类数据分析...NA000NANA00[]FALSE

df2 <- data.frame(df[,'district'],df[,'salary'])
names(df2) <- c("district",'salary')
head(df2)
A data.frame: 6 × 2
districtsalary
<chr><int>
1余杭区37500
2滨江区15000
3江干区 3500
4江干区45000
5余杭区30000
6余杭区50000
df2 %>% 
  group_by(district) %>% 
  summarise(salary_avg = mean(salary)) %>% 
  top_n(1, salary_avg)
A data.frame: 1 × 1
salary_avg
<dbl>
31723.81
df %>%
  group_by(district) %>%
  summarise(avg = mean(salary)) %>%
  arrange(desc(avg)) %>%
  filter(row_number() == 1)
A data.frame: 1 × 1
avg
<dbl>
31723.81