# R语言文本读取异常处理
# 分类:垃圾邮件过滤
# 案例来源:
# 《机器学习:实用案例解析》 第三章
# 处理过程:读取文件夹下所有的邮件,提取每封邮件的正文,
# 将各封邮件的正文分别合并为一个字符串,最后汇总
# 到一个向量中。
# 目的:书中的代码实际运行时会暴露出多个错误和警告,
# 本文解决此类问题,同时尽可能不丢弃数据信息。
# Environment setup: clear the workspace, load the packages used by the
# script, and list the spam message files to process.
# NOTE(review): rm(list = ls()) wipes the global environment of whoever runs
# this script; it is kept here only to preserve the script's existing behavior.
rm(list = ls())
library("tm")
library("ggplot2")

# Directory holding the spam e-mails, relative to the working directory.
spam.path <- file.path("data", "spam")

# All entries of that directory except the one named "cmds"
# (presumably not an e-mail message -- confirm against the data set).
spam.docs <- dir(spam.path)
spam.docs <- spam.docs[spam.docs != "cmds"]
# Problem code: the get.msg version that the experiments below start from.
# Reads the e-mail at `path` and returns its body, i.e. every line after the
# first empty line, joined with "\n".
# Cause of the problem: some of the files contain characters that cannot be
# read, so readLines() signals warnings/errors for them.
get.msg <- function(path)
{
  con <- file(path, open = "rt", encoding = "latin1")
  # Close the connection on every exit path, including when readLines() fails
  # or a caller's tryCatch() unwinds out of this function on a warning.
  on.exit(close(con), add = TRUE)
  text <- readLines(con)
  # The message always begins after the first full line break
  msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
  return(paste(msg, collapse = "\n"))
}
######################## Several handling strategies
# exp1
# Handle conditions from outside the function: the call evaluates to "war" if
# get.msg signals a warning and to "erro" if it signals an error.
tryCatch(
  get.msg(file.path(spam.path, spam.docs[6])),
  warning = function(w) "war",
  error = function(e) "erro",
  finally = "fina"
)
# exp2
# Variant of get.msg that handles conditions inside the function: the whole
# open/read/extract sequence runs under tryCatch().
# Returns the message body (the lines after the first empty line, joined with
# "\n"), the marker string "war" if a warning was signalled, or "erro" if an
# error was signalled.
get.msg <- function(path)
{
  con <- NULL
  # A caught warning/error unwinds out of the tryCatch() body before any
  # explicit close() could run, so the connection is released here instead.
  on.exit(if (!is.null(con)) close(con), add = TRUE)
  tryCatch({
    con <- file(path, open = "rt", encoding = "latin1")
    text <- readLines(con)
    # The message always begins after the first full line break
    msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
    paste(msg, collapse = "\n")
  },
  warning = function(war) "war",
  error = function(err) "erro"
  )
}
# Try the exp2 version on the sixth spam file.
get.msg(path = file.path(spam.path, spam.docs[6]))
# exp3-1 (counter-example: tryCatch() is not placed around the statement that
# actually raises the condition)
# readLines() runs outside tryCatch(), so its warnings are not intercepted;
# only conditions raised while extracting the body are turned into the marker
# strings "war" / "erro".
# Returns the body of the e-mail at `path` (lines after the first empty line,
# joined with "\n"), or one of the marker strings.
get.msg <- function(path)
{
  con <- file(path, open = "rt", encoding = "latin1")
  # Paired with the file() call above; runs even if readLines() fails.
  on.exit(close(con), add = TRUE)
  text <- readLines(con) # NOTE: this is where the condition is really raised
  msg <- tryCatch({
    # The message always begins after the first full line break
    text[seq(which(text == "")[1] + 1, length(text), 1)]
  },
  warning = function(war) "war",
  error = function(err) "erro"
  )
  return(paste(msg, collapse = "\n"))
}
# exp3-2(正例,正确捕捉了异常错误的位置)
# get.msg with readLines() placed inside tryCatch(), so conditions raised
# while reading are intercepted as well.
# Returns the body of the e-mail at `path` (lines after the first empty line,
# joined with "\n"), or "war" / "erro" if a warning / error was signalled
# while reading or extracting.
get.msg <- function(path)
{
  handle <- file(path, open = "rt", encoding = "latin1")

  # Read the file and keep what follows the first empty line.
  # NOTE: readLines() is where the condition is really raised.
  extract.body <- function() {
    lines <- readLines(handle)
    first.blank <- which(lines == "")[1]
    lines[seq(first.blank + 1, length(lines), 1)]
  }

  body <- tryCatch(
    extract.body(),
    warning = function(w) "war",
    error = function(e) "erro",
    finally = "fina"
  )

  close(handle) # paired with the file() call above
  paste(body, collapse = "\n")
}
# Run the exp3-2 version on the sixth spam message.
get.msg(file.path(spam.path, spam.docs[6]))

# exp4-1
# Inline (non-function) version: read one line at a time so that a warning or
# error only affects the line on which it occurs. A line whose read signals a
# condition is replaced by the marker "war" / "err"; every line read inside
# the loop is echoed with a running counter.
path <- file.path(spam.path, spam.docs[6])
text <- NULL
con <- file(path, open = "rt", encoding = "latin1")
line <- readLines(con, n = 1)
i <- 1
while (length(line) != 0) {
  line <- tryCatch(
    readLines(con, n = 1),
    error = function(e) "err",   # alternatively return NULL or ""
    warning = function(w) "war"
  )
  text <- c(text, line)
  cat(i, ' : ', line, '\n')
  i <- i + 1
}
# Close before the indexing below, which fails when there is no usable empty
# line; otherwise the connection would be left open on that failure.
close(con)
msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
paste(msg, collapse = "\n")

# exp4-2 (a fairly complete solution)
# Function version
path <- file.path(spam.path, spam.docs[6])
# Line-by-line get.msg.
# Returns the body of the e-mail at `path`: every line after the first empty
# line, joined with "\n".
# Warnings raised while reading a line are muffled and the line is kept, so a
# problematic line neither disappears nor ends the read early. An error while
# reading ends the read and keeps what was collected so far.
# NOTE: this version has no guard for a file without an empty line (or whose
# first empty line is its last line); the indexing then fails, which is the
# case examined in the exp5 section below.
get.msg <- function(path){
  con <- file(path, open = "rt", encoding = "latin1")
  # Release the connection on every exit path, including the indexing failure.
  on.exit(close(con), add = TRUE)

  # Read a single line; NULL/character(0) means "stop reading".
  read.line <- function() {
    tryCatch(
      withCallingHandlers(
        readLines(con, n = 1),
        warning = function(w) invokeRestart("muffleWarning")
      ),
      error = function(e) NULL
    )
  }

  lines <- list()
  repeat {
    line <- read.line()
    if (length(line) == 0) break
    lines[[length(lines) + 1]] <- line
  }
  text <- unlist(lines)

  # The message always begins after the first full line break
  msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
  return(paste(msg, collapse = "\n"))
}
# Run the exp4-2 version on the sixth spam message.
path <- file.path(spam.path, spam.docs[6])
get.msg(path)

# exp5
# Further test on message 35.
# The unguarded statement
#   msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
# raises an error here because its precondition is not met for this file.
all.spam <- vapply(spam.docs[35],
                   function(p) get.msg(file.path(spam.path, p)),
                   character(1))
path <- file.path(spam.path, spam.docs[35])
# Final get.msg: line-by-line reader with a guard for missing message bodies.
# Returns the body of the e-mail at `path` -- every line after the first empty
# line, joined with "\n" -- or "" when the file has no empty line or nothing
# follows the first empty line.
# Warnings raised while reading a line are muffled and the line is kept, so a
# problematic line neither disappears nor ends the read early. An error while
# reading ends the read and keeps what was collected so far.
get.msg <- function(path){
  con <- file(path, open = "rt", encoding = "latin1")
  # Release the connection on every exit path.
  on.exit(close(con), add = TRUE)

  # Read a single line; NULL/character(0) means "stop reading".
  read.line <- function() {
    tryCatch(
      withCallingHandlers(
        readLines(con, n = 1),
        warning = function(w) invokeRestart("muffleWarning")
      ),
      error = function(e) NULL
    )
  }

  lines <- list()
  repeat {
    line <- read.line()
    if (length(line) == 0) break
    lines[[length(lines) + 1]] <- line
  }
  text <- unlist(lines)

  # The message begins after the first empty line; without such a line, or
  # when it is the last line, there is no body to extract.
  blank <- which(text == "")
  if (length(blank) > 0 && blank[1] < length(text)) {
    msg <- text[seq(blank[1] + 1, length(text), 1)]
  } else {
    msg <- NULL
  }
  return(paste(msg, collapse = "\n"))
}
# Extract the body of every spam message. vapply() guarantees a character
# vector (named by file name) even when spam.docs is empty, where sapply()
# would return an empty list.
all.spam <- vapply(spam.docs,
                   function(p) get.msg(file.path(spam.path, p)),
                   character(1))