R语言文本读取异常处理

最新推荐文章于 2023-03-30 15:45:26 发布

山谷來客

最新推荐文章于 2023-03-30 15:45:26 发布

阅读量2.4k

点赞数

分类专栏： R / 机器学习 / 文本挖掘　文章标签：异常处理 / r语言 / 文本挖掘

本文链接：https://blog.csdn.net/u010035907/article/details/50936232

版权

机器学习同时被 3 个专栏收录

9 篇文章 0 订阅

订阅专栏

6 篇文章 0 订阅

订阅专栏

文本挖掘

1 篇文章 0 订阅

订阅专栏

#        R语言文本读取异常处理
# 分类：垃圾邮件过滤
# 案例来源：
# 《机器学习：实用案例解析》第三章
# 处理过程：读取文件下所有的邮件，提取邮件正文
#      并将其各自合并在一个向量中放置，最后汇总
#      到一个向量中。
# 目的：书中的代码在实际运行时会暴露出多个错误和警告；
#      本文逐步解决这些问题，同时尽可能不丢弃数据信息

rm(list=ls())
library('tm')
library('ggplot2')

# Directory that holds the spam messages, relative to the working directory.
spam.path <- file.path("data", "spam")

# Every entry in that directory except the one named "cmds".
spam.docs <- list.files(spam.path)
spam.docs <- spam.docs[!(spam.docs %in% "cmds")]

# Baseline reader (flagged in this file as the problematic version).
#
# Reads the file at `path` as latin1 text and returns everything after the
# first empty line, joined with "\n".
#
# Args:
#   path: path of the file to read.
# Returns:
#   A length-one character vector holding the message body.
# Errors:
#   Nothing is handled here: conditions from file()/readLines() propagate,
#   and seq() raises an error when the file has no empty line or when the
#   empty line is the last line.
get.msg <- function(path)
{
  con <- file(path, open = "rt", encoding = "latin1")
  # Close the connection on every exit path, including the error paths above.
  on.exit(close(con), add = TRUE)
  text <- readLines(con)
  # The message always begins after the first full line break
  msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
  paste(msg, collapse = "\n")
}

#问题原因：path指示的文件中，有字符无法读取

######################## 几种解决策略
# exp1: guard the call site. Any warning evaluates to "war" and any error to
# "erro"; the finally clause only evaluates a constant.
tryCatch(
  get.msg(file.path(spam.path, spam.docs[6])),
  warning = function(w) "war",
  error = function(e) "erro",
  finally = "fina"
)

# exp2: move the tryCatch inside the reader so the whole read is guarded.
#
# Args:
#   path: path of the file to read (opened as latin1 text).
# Returns:
#   The text after the first empty line joined with "\n", or the marker
#   string "war" if any warning was raised, or "erro" if any error was raised.
get.msg <- function(path)
{
  con <- NULL
  # Release the connection on every exit path. Previously it was closed only
  # on success, so a handled warning or error left it open.
  on.exit(if (!is.null(con)) close(con), add = TRUE)
  tryCatch({
    con <- file(path, open = "rt", encoding = "latin1")
    text <- readLines(con)
    # The message always begins after the first full line break
    msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
    paste(msg, collapse = "\n")
  },
  warning = function(war) "war",
  error = function(err) "erro"
  )
}

# Run the exp2 reader on the sixth entry of spam.docs.
get.msg(file.path(spam.path, spam.docs[6]))

# exp3-1 (counter-example: the tryCatch does not cover the statement where
# the failure actually occurs).
#
# Args:
#   path: path of the file to read (opened as latin1 text).
# Returns:
#   The text after the first empty line joined with "\n", or "war" / "erro"
#   when extracting the body raises a warning / error. Conditions raised by
#   readLines() itself are NOT handled, because it runs outside the tryCatch.
get.msg <- function(path) {
  conn <- file(path, open = "rt", encoding = "latin1")
  # Note: this is where the failure really happens, and it is unguarded.
  lines <- readLines(conn)
  body <- tryCatch(
    {
      # The message always begins after the first full line break
      first_blank <- which(lines == "")[1]
      lines[seq(first_blank + 1, length(lines), 1)]
    },
    warning = function(w) "war",
    error = function(e) "erro"
  )
  # Must refer to the connection opened above.
  close(conn)
  paste(body, collapse = "\n")
}

# exp3-2 (correct example: the tryCatch now covers the statement where the
# failure actually occurs).
#
# Args:
#   path: path of the file to read (opened as latin1 text).
# Returns:
#   The text after the first empty line joined with "\n", or "war" / "erro"
#   when reading or extracting the body raises a warning / error.
get.msg <- function(path) {
  conn <- file(path, open = "rt", encoding = "latin1")
  body <- tryCatch(
    {
      # Note: readLines() is where the failure really happens, so it is
      # evaluated inside the guarded expression.
      lines <- readLines(conn)
      # The message always begins after the first full line break
      first_blank <- which(lines == "")[1]
      lines[seq(first_blank + 1, length(lines), 1)]
    },
    warning = function(w) "war",
    error = function(e) "erro"
  )
  # Must refer to the connection opened above.
  close(conn)
  paste(body, collapse = "\n")
}

# Run the exp3-2 reader on the sixth entry of spam.docs.
get.msg(file.path(spam.path, spam.docs[6]))

# exp4-1

# Top-level script form (no function wrapper): read the file one line at a
# time so that a problem on one line does not abort the whole read. Each line
# is echoed with its running index as it is read.
path <- file.path(spam.path, spam.docs[6])

text <- NULL
con <- file(path, open = "rt", encoding = "latin1")
i <- 1
repeat {
  line <- tryCatch(
    readLines(con, n = 1),
    # Tag the marker so the loop can tell a failed read apart from a real
    # line whose text happens to be "err". (NULL or "" would also work as the
    # replacement value.)
    error = function(e) structure("err", read_failed = TRUE),
    warning = function(w) "war"
  )
  # readLines() yields a zero-length vector at end of input.
  if (length(line) == 0) {
    break
  }
  # Every line, including the first one, is recorded.
  text <- c(text, line)
  cat(i, ' : ', line, '\n')
  i <- i + 1
  # Stop after a read error: if the error repeats on every call, the marker
  # is never zero-length and the loop would not terminate.
  if (isTRUE(attr(line, "read_failed"))) {
    break
  }
}
# Close before extracting the body so a failure in seq() cannot leave the
# connection open.
close(con)
msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
paste(msg, collapse = "\n")

# exp4-2(比较完美的解决！)

# Function form: target file for the reader defined below.
path <- file.path(spam.path, spam.docs[6])

# exp4-2: line-by-line reader that keeps as much of the file as possible.
#
# Args:
#   path: path of the file to read (opened as latin1 text).
# Returns:
#   The text after the first empty line, joined with "\n".
# Errors:
#   seq() still raises an error when the file has no empty line or when the
#   empty line is the last line read.
get.msg <- function(path){
  con <- file(path, open = "rt", encoding = "latin1")
  # Close the connection on every exit path, including the seq() error.
  on.exit(close(con), add = TRUE)

  # Read a single line. Warnings are muffled in place so the line that
  # triggered them is still returned; an error is treated as end of input.
  read_line <- function() {
    tryCatch(
      withCallingHandlers(
        readLines(con, n = 1),
        warning = function(w) invokeRestart("muffleWarning")
      ),
      error = function(e) character(0)
    )
  }

  text <- character(0)
  repeat {
    line <- read_line()
    # readLines() yields a zero-length vector at end of input.
    if (length(line) == 0) {
      break
    }
    # Every line, including the first one, is recorded.
    text <- c(text, line)
  }
  # The message always begins after the first full line break
  msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
  paste(msg, collapse = "\n")
}

# Rebuild the path of the sixth entry of spam.docs and run the reader on it.
path <- file.path(spam.path, spam.docs[6])

get.msg(path)

# exp5
# Further testing: apply the reader to the 35th entry of spam.docs.
# vapply() pins the result to one string per file, so the result type does
# not depend on the input.
all.spam <- vapply(spam.docs[35],
                   function(p) get.msg(file.path(spam.path, p)),
                   character(1))
# msg <- text[seq(which(text == "")[1] + 1, length(text), 1)] raises an error
# here because its condition cannot be satisfied for this file.

path <- file.path(spam.path, spam.docs[35])

# exp5: line-by-line reader that also tolerates files without a usable
# header/body separator.
#
# Args:
#   path: path of the file to read (opened as latin1 text).
# Returns:
#   The text after the first empty line joined with "\n", or "" when the file
#   has no empty line or nothing follows it.
get.msg <- function(path){
  con <- file(path, open = "rt", encoding = "latin1")
  # Close the connection on every exit path.
  on.exit(close(con), add = TRUE)

  # Read a single line. Warnings are muffled in place so the line that
  # triggered them is still returned; an error is treated as end of input.
  read_line <- function() {
    tryCatch(
      withCallingHandlers(
        readLines(con, n = 1),
        warning = function(w) invokeRestart("muffleWarning")
      ),
      error = function(e) character(0)
    )
  }

  text <- character(0)
  repeat {
    line <- read_line()
    # readLines() yields a zero-length vector at end of input.
    if (length(line) == 0) {
      break
    }
    # Every line, including the first one, is recorded.
    text <- c(text, line)
  }

  # Only slice when a separator exists and at least one line follows it;
  # the unguarded seq() call fails otherwise.
  blank <- which(text == "")
  if (length(blank) > 0 && blank[1] < length(text)) {
    msg <- text[seq(blank[1] + 1, length(text), 1)]
  } else {
    msg <- NULL
  }
  paste(msg, collapse = "\n")
}

# Read every message in the corpus. vapply() guarantees a character vector
# (one string per file, named by file name) even when spam.docs is empty.
all.spam <- vapply(spam.docs,
                   function(p) get.msg(file.path(spam.path, p)),
                   character(1))