Unzip file
> library(utils)
> unzip("rprog-data-ProgAssignment3-data.zip")
Finding the best hospital in a state
best <- function(state, outcome) {
  ## Find the hospital with the lowest 30-day death rate for an outcome
  ## within one state.
  ##
  ## state:   2-character abbreviated state name
  ## outcome: one of "heart attack", "heart failure", "pneumonia"
  ##
  ## Returns the hospital name as a character string. Hospitals tied on the
  ## lowest rate are resolved alphabetically by name. Returns NA when no
  ## hospital in the state has a rate for the outcome.
  ## Stops with "invalid state" / "invalid outcome" on bad arguments.

  ## Read outcome data; "Not Available" is read as NA so the numeric
  ## conversion below does not emit coercion warnings.
  file <- read.csv("outcome-of-care-measures.csv", colClasses = "character",
                   na.strings = "Not Available")
  outcomevector <- c("heart attack", "heart failure", "pneumonia")

  ## Check that state and outcome are valid
  if (!state %in% file[, "State"]) {
    stop("invalid state")
  }
  if (!outcome %in% outcomevector) {
    stop("invalid outcome")
  }

  ## Pick the rate column by position: 11, 17 and 23 are the columns this
  ## function reads for heart attack, heart failure and pneumonia.
  hospitalrate <- c(11, 17, 23)
  hospitalratecol <- hospitalrate[match(outcome, outcomevector)]

  ## Rank the hospitals of the state by rate, then by name. na.last = NA
  ## drops hospitals without a rate, and the returned positions still index
  ## df1 directly (no index shift as with na.omit() + which.min()).
  df1 <- file[file$State == state, ]
  rate <- as.numeric(df1[, hospitalratecol])
  ranked <- order(rate, df1[, "Hospital.Name"], na.last = NA)
  if (length(ranked) == 0) {
    return(NA_character_)
  }
  df1[ranked[1], "Hospital.Name"]
}
match: 匹配两个向量,返回 x 中每个元素在 y 中首次出现的位置索引(不存在则返回 NA);若只需判断是否存在(TRUE/FALSE),使用 %in%
> x <- colnames(diamonds)[1:5]
> x
[1] "carat" "cut" "color" "clarity" "depth"
> y <- colnames(diamonds)[1:10]
> y
[1] "carat" "cut" "color" "clarity" "depth" "table" "price" "x" "y"
[10] "z"
1.使用match函数找出x中每个元素在y中的位置
> match(x,y)
[1] 1 2 3 4 5
Ranking hospitals by outcome in a state
rankhospital <- function(state, outcome, num) {
  ## Return the name of the hospital holding a given rank for an outcome
  ## within one state.
  ##
  ## state:   2-character abbreviated name of a state
  ## outcome: "heart attack", "heart failure" or "pneumonia"
  ## num:     "best", "worst", or the numeric rank wanted
  ##
  ## Hospitals are ranked by 30-day death rate (ascending), ties broken by
  ## hospital name; hospitals without a rate are left out of the ranking.

  ## Load the data, treating "Not Available" as NA
  care <- read.csv(
    "outcome-of-care-measures.csv",
    colClasses = "character",
    na.strings = "Not Available"
  )

  ## Supported outcomes and the position of their rate column
  outcome_names <- c("heart attack", "heart failure", "pneumonia")
  rate_positions <- c(11, 17, 23)

  ## Validate the arguments (state first, then outcome)
  if (!(state %in% care[["State"]])) {
    stop("invalid state")
  }
  if (!(outcome %in% outcome_names)) {
    stop("invalid outcome")
  }

  ## Translate the outcome into the name of its rate column
  rate_name <- names(care)[rate_positions[match(outcome, outcome_names)]]

  ## Keep only this state's hospitals: name in column 1, rate in column 2
  in_state <- care[care$State == state, c("Hospital.Name", rate_name)]

  ## Sort by numeric rate, then by name; na.last = NA drops missing rates
  ranking <- order(as.numeric(in_state[, 2]), in_state[, 1], na.last = NA)
  ranked <- in_state[ranking, ]

  ## Resolve "best"/"worst" into a row position; the NA rows are already
  ## gone, so "worst" is simply the last remaining row
  position <- if (num == "worst") {
    nrow(ranked)
  } else if (num == "best") {
    1
  } else {
    num
  }

  ranked[position, 1]
}
Ranking hospitals in all states
Return a specific hospital name for each state
rankall <- function(outcome, num = "best") {
  ## For every state, find the hospital holding a given rank for an outcome.
  ##
  ## outcome: "heart attack", "heart failure" or "pneumonia"
  ## num:     "best", "worst", or the numeric rank wanted
  ##
  ## Returns a data frame with one row per state (sorted by state) and the
  ## columns `hospitalname` and `state`; row names are the state
  ## abbreviations. `hospitalname` is NA when the state has no hospital at
  ## that rank. Stops with "invalid outcome" for an unknown outcome.

  ## Read outcome data, treating "Not Available" as NA
  file <- read.csv("outcome-of-care-measures.csv", colClasses = "character",
                   na.strings = "Not Available")
  outcomevector <- c("heart attack", "heart failure", "pneumonia")

  ## Check that the outcome is valid
  if (!outcome %in% outcomevector) {
    stop("invalid outcome")
  }

  ## Pick the rate column by position
  hospitalrate <- c(11, 17, 23)
  hospitalratecol <- hospitalrate[match(outcome, outcomevector)]

  ## Rates are read as character; convert them so that ordering is numeric
  ## ("10.1" must rank after "9.5", which character ordering gets wrong).
  hospital <- file[, "Hospital.Name"]
  statecol <- file[, "State"]
  rate <- as.numeric(file[, hospitalratecol])
  statename <- sort(unique(statecol))

  ## Name of the hospital at the requested rank within one state
  pick_hospital <- function(state) {
    ## which() drops rows whose state or rate is NA, so "worst" is the
    ## last hospital that actually has a rate
    rows <- which(statecol == state & !is.na(rate))
    ranked <- hospital[rows][order(rate[rows], hospital[rows])]
    ## Resolve the rank per state without overwriting `num`; otherwise
    ## "worst" would be frozen to the size of the first state
    idx <- if (identical(num, "worst")) {
      length(ranked)
    } else if (identical(num, "best")) {
      1L
    } else if (is.numeric(num)) {
      num
    } else {
      NA
    }
    if (is.na(idx) || idx < 1 || idx > length(ranked)) {
      NA_character_
    } else {
      ranked[idx]
    }
  }

  ## vapply builds the whole column at once instead of rbind() in a loop
  hospitalname <- vapply(statename, pick_hospital, character(1),
                         USE.NAMES = FALSE)
  data.frame(hospitalname = hospitalname, state = statename,
             row.names = statename)
}
subset
subset函数,从某一个数据框中选择出符合某条件的数据或是相关的列
another solution (unfinished)
using split and lapply, to be continued…,
rankall <- function(outcome, num = "best") {
  ## For every state, find the hospital holding a given rank for an outcome,
  ## using split() and lapply() instead of an explicit loop.
  ##
  ## outcome: "heart attack", "heart failure" or "pneumonia"
  ## num:     "best", "worst", or the numeric rank wanted
  ##
  ## Returns a data frame with one row per state (sorted by state) and the
  ## columns `hospitalname` and `state`; row names are the state
  ## abbreviations. `hospitalname` is NA when the state has no hospital at
  ## that rank. Stops with "invalid outcome" for an unknown outcome.

  ## Read the data, treating "Not Available" as NA
  file <- read.csv("outcome-of-care-measures.csv", colClasses = "character",
                   na.strings = "Not Available")
  outcomevector <- c("heart attack", "heart failure", "pneumonia")

  ## Check that the outcome is valid
  if (!outcome %in% outcomevector) {
    stop("invalid outcome")
  }
  statename <- sort(unique(file[, "State"]))

  ## Subset to three columns, selecting the rate column by position, and
  ## give it the fixed name "Rate" so it can be referenced below
  hospitalrate <- c(11, 17, 23)
  hospitalratecol <- hospitalrate[match(outcome, outcomevector)]
  df1 <- data.frame(Hospital.Name = file[, "Hospital.Name"],
                    State = file[, "State"],
                    Rate = as.numeric(file[, hospitalratecol]),
                    stringsAsFactors = FALSE)

  ## Remove hospitals without a rate
  df2 <- df1[!is.na(df1$Rate), ]

  ## Order by state, then rate, then hospital name; after this the worst
  ## hospital is always the last row of its state subset
  df3 <- df2[order(df2$State, df2$Rate, df2$Hospital.Name), ]

  ## Split by state. Using every state as a factor level keeps states whose
  ## hospitals all lack a rate (they get an empty data frame).
  s <- split(df3, factor(df3$State, levels = statename))

  ## Receives the ordered data frame of one state, returns a hospital name
  pick_hospital <- function(data) {
    n <- nrow(data)
    idx <- if (identical(num, "worst")) {
      n
    } else if (identical(num, "best")) {
      1L
    } else if (is.numeric(num)) {
      num
    } else {
      NA
    }
    if (is.na(idx) || idx < 1 || idx > n) {
      NA_character_
    } else {
      data$Hospital.Name[idx]
    }
  }

  ## Run lapply: a named list, names are states, values are hospital names
  hospitals <- lapply(s, pick_hospital)

  data.frame(hospitalname = unlist(hospitals, use.names = FALSE),
             state = names(hospitals),
             row.names = names(hospitals))
}
hints
## The function should be defined within your assignment function
assignment_function <- function() {
lapply_function <- function() {
## do something
if(condition1) do something
if(condition2) do something else
return a value
}
}
## If you order properly the worst hospital will always be the last one in the state subset. Just be sure you've done things in the correct order prior to running split and lapply -
## Read the data
## Subset to three columns
## Remove NA Values
## Order by state then outcome then hospital name
## Split by state
## Run lapply
## Your function for lapply should take one parameter as the data frame for a state and output a hospital name. The results of lapply will be a named list where the list names are state and the list values are hospital name (one for each state).
## The following function receives a state data frame from the split data
## function_for_lapply(data) { do something with data }