走进NHANES（1）

最新推荐文章于 2025-03-26 17:48:43 发布
weixin_49320263
最新推荐文章于 2025-03-26 17:48:43 发布
阅读量635
点赞数
分类专栏：公共数据库文章标签：数据库
原文链接：https://wwwn.cdc.gov/nchs/nhanes/tutorials/SampleCode.aspx
版权
公共数据库专栏收录该内容
4 篇文章
订阅专栏
NHANES Tutorials - Sample Code Modulehttps://wwwn.cdc.gov/nchs/nhanes/tutorials/SampleCode.aspx
NHANES官网提供了数据分析的R代码，下面我们一起看看官网的分析代码。
#导入包
library(dplyr)
library(survey)
#设置为全局变量
options(survey.lonely.psu='adjust')
#展示R包版本
cat("R package versions:\n")
for (p in c("base", "survey","dplyr")) { 
  cat(p, ": ", as.character(packageVersion(p)), "\n")
}

#数据准备---下载
# Demographic 的缩写是DEMO
getwd()
download.file("https://wwwn.cdc.gov/nchs/nhanes/2013-2014/DEMO_H.XPT", tf <- tempfile(), mode="wb")
DEMO_H <- foreign::read.xport(tf)[,c("SEQN","RIAGENDR","RIDAGEYR","SDMVSTRA","SDMVPSU","WTMEC2YR")]
download.file("https://wwwn.cdc.gov/nchs/nhanes/2015-2016/DEMO_I.XPT", tf <- tempfile(), mode="wb")
DEMO_I <- foreign::read.xport(tf)[,c("SEQN","RIAGENDR","RIDAGEYR","SDMVSTRA","SDMVPSU","WTMEC2YR")]
#SEQN:编号 SDMVPSU:集群，抽样单元 WTINT2YR:权重 SDMVSTRA:分层 RIAGENDR:性别 RIDAGEYR:年龄 mortstat:死亡与否

#心理健康- 抑郁症筛查(DPQ) 
download.file("http://wwwn.cdc.gov/nchs/nhanes/2013-2014/DPQ_H.XPT", tf <- tempfile(), mode="wb")
DPQ_H <- foreign::read.xport(tf)
download.file("http://wwwn.cdc.gov/nchs/nhanes/2015-2016/DPQ_I.XPT", tf <- tempfile(), mode="wb")
DPQ_I <- foreign::read.xport(tf)

#合并数据
DEMO <- bind_rows(DEMO_H, DEMO_I)
DPQ <- bind_rows(DPQ_H, DPQ_I)

#合并DEMO和DPQ文件，创建变量
One <- left_join(DEMO, DPQ, by="SEQN") %>%#左连接意味着保留第一个数据框（在这里是DEMO）中的所有行，并添加第二个数据框（在这里是DPQ）中匹配的列的值。如果第二个数据框中没有匹配的行，则结果中的相应列将包含NA（表示缺失值）。
  # Set 7=Refused and 9=Don't Know To Missing for variables DPQ010 thru DPQ090 ##
  mutate_at(vars(DPQ010:DPQ090), ~ifelse(. >=7, NA, .)) %>%
  mutate(. , 
         # 用于总体总结的标志
         one = 1,
         # 计算 depression score ，利用如下变量 DPQ010 -- DPQ090
         Depression.Score = rowSums(select(. , DPQ010:DPQ090)),
         # 将上面的计数资料转化为分类变量资料
         Depression= ifelse(Depression.Score >=10, 100, 0), 
         # 转化为因子
         Gender = factor(RIAGENDR, labels=c("Men", "Women")),
         Age.Group = cut(RIDAGEYR, breaks=c(-Inf,19,39,59,Inf),labels=c("Under 20", "20-39","40-59","60 and over")),#使用了cut()函数来创建一个新的因子变量Age.Group，该变量根据RIDAGEYR（可能是年龄变量）的值将观测值分配到不同的年龄组。
         # Generate 4-year MEC weight (Divide weight by 2 because we are appending 2 survey cycles)                                                             
         # Note: using the MEC Exam Weights (WTMEC2YR), per the analytic notes on the               
         #       Mental Health - Depression Screener (DPQ_H) documentation                          
         WTMEC4YR = WTMEC2YR/2 ,
         # 定义分析感兴趣人群的指标：20岁及以上具有有效抑郁评分的成年人
         inAnalysis= (RIDAGEYR >= 20 & !is.na(Depression.Score))#只有当 RIDAGEYR 大于或等于20且 Depression.Score 不是 NA 时，对应的 inAnalysis 的元素才会是 TRUE
  ) %>% 
  # drop DPQ variables
  select(., -starts_with("DPQ"))#从当前数据框中选择所有列，但排除那些列名以'DPQ'开头的列。

#定义调查设计
# Define survey design for overall dataset
NHANES_all <- svydesign(data=One, id=~SDMVPSU, strata=~SDMVSTRA, weights=~WTMEC4YR, nest=TRUE)
#id=~SDMVPSU：这指定了初级抽样单位（Primary Sampling Unit，PSU）的变量。在NHANES中，PSU通常是一个地理区域或一组家庭。~是一个公式符号，用于指定设计元素。
#strata=~SDMVSTRA：这指定了分层的变量。分层抽样是一种抽样技术，其中总体被划分为几个子总体（或层），然后从每个子总体中独立抽样。SDMVSTRA变量可能包含这些层的标识。
#weights=~WTMEC4YR：这指定了每个观测值的权重。在NHANES中，权重通常用于调整由于抽样设计而导致的样本选择偏差。WTMEC4YR可能是四年期的权重变量。
#nest=TRUE：这表示样本设计是嵌套的，也就是说，某些观测值（如个人）嵌套在更大的单位（如家庭或地理区域）内。在某些复杂的抽样设计中，考虑嵌套性是很重要的。

# 为感兴趣的子集创建一个调查设计对象：20岁及以上具有有效抑郁评分的成年人
# Subsetting the original survey design object ensures we keep the design information about the number of clusters and strata
NHANES <- subset(NHANES_all, inAnalysis)

#分析
#定义一个函数来调用svymean和未加权计数
getSummary <- function(varformula, byformula, design){
  # Get mean, stderr, and unweighted sample size
  c <- svyby(varformula, byformula, design, unwtd.count ) #svyby函数用于按byformula指定的变量对varformula指定的变量进行分组，并计算每个组的未加权样本量。结果存储在c`中。
  p <- svyby(varformula, byformula, design, svymean ) #计算每个组的均值和标准误。结果存储在p中。
  outSum <- left_join(select(c,-se), p) 
  outSum
}

#按性别、年龄组、年龄和性别计算抑郁症的总体患病率
#' Adults
getSummary(~Depression, ~one, NHANES)
#' By sex
getSummary(~Depression, ~Gender, NHANES)
#' By age
getSummary(~Depression, ~Age.Group, NHANES)
#' By sex and age
getSummary(~Depression, ~Gender + Age.Group, NHANES)

#比较男女之间的患病率
svyttest(Depression~Gender, NHANES)$p.value %>% as.numeric 
svyttest(Depression~Gender, subset(NHANES, Age.Group=="20-39"))$p.value %>% as.numeric
svyttest(Depression~Gender, subset(NHANES, Age.Group=="40-59"))$p.value %>% as.numeric
svyttest(Depression~Gender, subset(NHANES, Age.Group=="60 and over"))$p.value %>% as.numeric


#两两t检验
# 所有成年人的年龄组差异
# svyttest命令的完整输出
svyttest(Depression~Age.Group, subset(NHANES, Age.Group=="20-39" | Age.Group=="40-59"))
# 仅显示p值
svyttest(Depression~Age.Group, subset(NHANES, Age.Group=="20-39" | Age.Group=="40-59"))$p.value %>% as.numeric
svyttest(Depression~Age.Group, subset(NHANES, Age.Group=="20-39" | Age.Group=="60 and over"))$p.value %>% as.numeric
svyttest(Depression~Age.Group, subset(NHANES, Age.Group=="40-59" | Age.Group=="60 and over"))$p.value %>% as.numeric
#不同年龄组男性之间的差异
svyttest(Depression~Age.Group, subset(NHANES, Gender=="Men" & (Age.Group=="20-39" | Age.Group=="40-59")))$p.value %>% as.numeric
svyttest(Depression~Age.Group, subset(NHANES, Gender=="Men" & (Age.Group=="20-39" | Age.Group=="60 and over")))$p.value %>% as.numeric
svyttest(Depression~Age.Group, subset(NHANES, Gender=="Men" & (Age.Group=="40-59" | Age.Group=="60 and over")))$p.value %>% as.numeric
#不同年龄组男性之间的差异
svyttest(Depression~Age.Group, subset(NHANES, Gender=="Women" & (Age.Group=="20-39" | Age.Group=="40-59")))$p.value %>% as.numeric
svyttest(Depression~Age.Group, subset(NHANES, Gender=="Women" & (Age.Group=="20-39" | Age.Group=="60 and over")))$p.value %>% as.numeric
svyttest(Depression~Age.Group, subset(NHANES, Gender=="Women" & (Age.Group=="40-59" | Age.Group=="60 and over")))$p.value %>% as.numeric
上述代码实现对比2013-2016年美国20岁及以上成年人抑郁症患病率。