目的
对于一个原始粗糙数据,当其读取到R以后会有以下几个常见的问题:
- 变量类型错误: 如身份证号被识别为数值型变量
- 分类变量中有奇怪的值或存在NA
本文档的目标就是在数据清洗前,对原始数据进行初步探索,并针对探索结果指导数据清洗
数据读取及保存
#| eval: false #不运行代码
# --- Reading -----------------------------------------------------------------
# Plain CSV via base R (stringsAsFactors = FALSE can be passed if needed).
dat1 <- read.csv(file = paste0(path_data, "simout/mc_info.csv"))
# Excel workbook.
fchbp <- read_excel(path = paste0(path_data, "hbp分层.xlsx"))
# Restore the objects stored in an .Rdata file.
load(paste0(path_data, "dat_hbp.Rdata"))
# CSV with an explicit GB18030 encoding (recommended approach).
tax.tab <- read_csv(
  file = "Rbookdata/taxsamp.csv",
  locale = locale(encoding = "GB18030")
)

# --- Writing -----------------------------------------------------------------
# Comma-separated text file with a header row and no row names.
write.table(
  x = decom_hbp,
  file = paste0(path_data, "decom_hbp.csv"),
  sep = ",",
  row.names = FALSE,
  col.names = TRUE
)
write_csv(x = SES_pr, file = paste0(path_data, "table1_explore/SES_pr.csv"))
# Store a single object in an .Rdata file.
save(dat2, file = paste0(path_data, "dat2.Rdata"))
数据变量类型展示
数据变量类型展示的自编函数如下:
#| eval: false #不运行代码
#' Group the column names of a data set by variable type
#'
#' Each column is tested with `is.character()`, `is.double()`,
#' `is.integer()`, `is.factor()` and `is.logical()`; the names of the
#' columns passing each test are collected.
#'
#' @param input_data A data frame (or another named list of vectors).
#' @return A named list of character vectors with the elements `var_chr`,
#'   `var_double`, `var_int`, `var_factor` and `var_logic`.
check_var_type <- function(input_data){
  # Implemented with base R only, so calling the function does not attach
  # any package as a side effect.
  # Names of the columns for which `predicate` returns TRUE.
  names_where <- function(predicate) {
    hits <- vapply(input_data, predicate, logical(1))
    names(input_data)[hits]
  }

  list(
    var_chr = names_where(is.character),
    var_double = names_where(is.double),
    var_int = names_where(is.integer),
    # Report the factor columns here (previously the integer columns were
    # returned under this name by mistake).
    var_factor = names_where(is.factor),
    var_logic = names_where(is.logical)
  )
}
利用NHANES数据为例:
#| warning: false
#| message: false
# Attach the NHANES package and load its example data set.
library("NHANES")
data("NHANES")
# Load the helper definition from its script file.
source("/Users/hecongyuan/Documents/Study/R/R语言教程/function/check_var_type.R")
# Classify the columns by type; the surrounding parentheses print the result.
(var_check <- check_var_type(NHANES))
数据类型转换
批量将所有int转换成factor
#| warning: false
#| message: false
#| eval: false
library("tidyverse")
# Batch-convert column types; adapt the selection to your needs.
# `var_check[["var_int"]]` can be replaced by an explicit character vector
# such as c("var1", "var3", ...) to convert hand-picked variables only.
# all_of() marks the vector as a set of external column names, which is the
# recommended way to pass a character vector to a tidyselect context.
NHANES <- NHANES %>%
  mutate(across(all_of(var_check[["var_int"]]), as.factor))
数据初步探索
方法一
library("skimr")
# Summarise the data set; the selection explicitly names every column.
skim(
  data = NHANES,
  all_of(names(NHANES))
)
方法二
利用gtsummary包
#| warning: false
#| message: false
#| eval: false
# Build a gtsummary overview table of a raw data set before cleaning.
#
# crude_data: the data set passed to tbl_summary().
# Returns the table with a caption and bold variable labels.
explore <- function(crude_data){
  library(gtsummary)

  # Statistics shown per variable kind. Alternative templates for continuous
  # variables, kept for reference:
  #all_continuous() ~ "{mean}±{sd}" ,
  #all_continuous() ~ "{median} ({p25}, {p75})",
  #all_continuous() ~ "{median} ({IQR})",
  #all_categorical() ~ "{n} ({p}%)",
  # The continuous template is a multi-line string, so its continuation
  # lines must stay unindented.
  stat_spec <- list(
    all_continuous() ~ "median:{median}
mean:{mean}
sd:{sd}
min:{min}
max:{max}
IQR:{IQR}",
    all_categorical() ~ "{n} / {N} ({p}%)"
  )

  # Digits per statistic; c() is NULL, i.e. nothing is specified for the
  # continuous statistics.
  digit_spec <- list(
    all_continuous() ~ c(),
    all_categorical() ~ c(0,0,2)
  )

  overview <- tbl_summary(crude_data, statistic = stat_spec, digits = digit_spec)
  overview <- modify_caption(overview, "**数据清理前探索**")
  overview <- bold_labels(overview)
  overview
}
# Load the explore() helper from its script file.
source("/Users/hecongyuan/Documents/Study/R/R语言教程/function/explore.R")
# Produce the pre-cleaning overview table for NHANES.
explore(crude_data = NHANES)
总结
- 一般情况下 直接进行skim看数据情况 再依据数据情况进行数据清洗
- 批量数据转化可以记忆一下