下面,我们就讲讲如何用 R
读取常见数据格式的文件。
1. txt/csv 格式文件
read.table(file, header = FALSE, sep = "", quote = "\"'",
dec = ".", numerals = c("allow.loss", "warn.loss", "no.loss"),
row.names, col.names, as.is = !stringsAsFactors,
na.strings = "NA", colClasses = NA, nrows = -1,
skip = 0, check.names = TRUE, fill = !blank.lines.skip,
strip.white = FALSE, blank.lines.skip = TRUE,
comment.char = "#",
allowEscapes = FALSE, flush = FALSE,
stringsAsFactors = default.stringsAsFactors(),
fileEncoding = "", encoding = "unknown", text, skipNul = FALSE)
read.csv(file, header = TRUE, sep = ",", quote = "\"",
dec = ".", fill = TRUE, comment.char = "", ...)
read.csv2(file, header = TRUE, sep = ";", quote = "\"",
dec = ",", fill = TRUE, comment.char = "", ...)
read.delim(file, header = TRUE, sep = "\t", quote = "\"",
dec = ".", fill = TRUE, comment.char = "", ...)
read.delim2(file, header = TRUE, sep = "\t", quote = "\"",
dec = ",", fill = TRUE, comment.char = "", ...)
上面 5 个函数都可以用于读取表格格式文件。
read.csv、read.csv2、read.delim 与 read.delim2 相较于 read.table,除了几个默认参数值不同外,其他参数都是完全一样的。
read.table:sep 指定的默认分隔符为空白字符(一个或多个空格、制表符、换行或回车),dec 指定的小数点以 . 分隔
read.csv: 读取 , 分隔符文件
read.csv2: 读取 ; 分隔符文件,且小数点以 , 分隔
read.delim: 读取 \t 分隔符文件
read.delim2: 读取 \t 分隔符文件,且小数点以 , 分隔
示例
> read.table('Downloads/data.txt')
V1 V2 V3 V4
1 segdup01284 hs3 111883743 111884767
2 segdup02286 hs1 152617218 152618252
3 segdup02286 hs3 111883745 111884756
4 segdup02365 hs1 158502674 158503718
5 segdup02365 hs3 111883744 111884768
6 segdup02574 hs1 199147461 199151487
7 segdup02574 hs3 113384907 113387537
8 segdup02968 hs1 235049483 235050522
9 segdup02968 hs3 111883744 111884768
写出文件
write.table(x, file = "", append = FALSE, quote = TRUE, sep = " ",
eol = "\n", na = "NA", dec = ".", row.names = TRUE,
col.names = TRUE, qmethod = c("escape", "double"),
fileEncoding = "")
write.csv(...)
write.csv2(...)
写出文件的函数与读取函数相对应
2. 读取 json 数据
这里我们使用 rjson
模块
2.1 安装
install.packages("rjson")
2.2 使用
数据
[
{
"Name": "Mario",
"Age": 32,
"Occupation": "Plumber",
"Rank": 3
},
{
"Name": "Peach",
"Age": 21,
"Occupation": "Princess",
"Rank": 1
}
]
读取
# Load the rjson package
library(rjson)
# Read the JSON file into an R object
data <- fromJSON(file = 'Downloads/data.json')
# Inspect the structure of the parsed data
# > str(data)
# List of 2
# $ :List of 4
# ..$ Name : chr "Mario"
# ..$ Age : num 32
# ..$ Occupation: chr "Plumber"
# ..$ Rank : num 3
# $ :List of 4
# ..$ Name : chr "Peach"
# ..$ Age : num 21
# ..$ Occupation: chr "Princess"
# ..$ Rank : num 1
可以看到,json
数据被转换为嵌套的 list
知道怎么读取 json
数据了,那如何写出成 json
格式的文件呢?
例如,我们想更改 Mario
的年龄为 45
,并将 Peach
的等级改为 9
# Update the Age field of the first record and the Rank field of the second
data[[1]][["Age"]] <- 45
data[[2]][["Rank"]] <- 9
# Serialise the modified list to a JSON string
outJson <- toJSON(data)
# Write the JSON string to new_data.json
write(outJson, file = "Downloads/new_data.json")
new_data.json
[
{
"Name": "Mario",
"Age": 45,
"Occupation": "Plumber",
"Rank": 3
},
{
"Name": "Peach",
"Age": 21,
"Occupation": "Princess",
"Rank": 9
}
]
3. xml 文件操作
我们以 KEGG
通路 hsa05130
的 kgml
文件为例
# 安装模块
install.packages("XML")
# 导入模块
library(XML)
# Parse the XML file into a document object
hsa <- xmlParse("Downloads/hsa05130.xml")
# Extract the root node; the calls below refer to it as `root`
root <- xmlRoot(hsa)
# Name of the root node
xmlName(root)
# [1] "pathway"
# Number of child nodes under the root node
xmlSize(root)
# [1] 293
# Show the first child node of the root
root[[1]]
# <entry id="4" name="path:hsa04810" type="map" link="https://www.kegg.jp/dbget-bin/www_bget?hsa04810">
# <graphics name="Regulation of actin cytoskeleton" fgcolor="#000000" bgcolor="#FFFFFF" type="roundrectangle" x="1237" y="777" width="119" height="34"/>
# </entry>
root[[1]][[1]] # first child of the first child node
# <graphics name="Regulation of actin cytoskeleton" fgcolor="#000000" bgcolor="#FFFFFF" type="roundrectangle" x="1237" y="777" width="119" height="34"/>
xmlSApply(root, xmlName) # names of all child nodes of the root
xmlSApply(root[[1]], xmlAttrs) # attributes of every child of the first child node
# graphics
# name "Regulation of actin cytoskeleton"
# fgcolor "#000000"
# bgcolor "#FFFFFF"
# type "roundrectangle"
# x "1237"
# y "777"
# width "119"
# height "34"
xmlSApply(root, xmlSize) # size of every child node of the root
# Use an XPath expression to select the entry node whose id attribute is 4
getNodeSet(root, "//entry[@id=4]")
# [[1]]
# <entry id="4" name="path:hsa04810" type="map" link="https://www.kegg.jp/dbget-bin/www_bget?hsa04810">
# <graphics name="Regulation of actin cytoskeleton" fgcolor="#000000" bgcolor="#FFFFFF" type="roundrectangle" x="1237" y="777" width="119" height="34"/>
# </entry>
#
# attr(,"class")
# [1] "XMLNodeSet"
# Convert the tree to a list; xmlToDataFrame can be used to get a data frame instead
hsa_list <- xmlToList(root)
# Modify the data: assign "haha" to the first child of the first child node
root[[1]][[1]] <- "haha"
# Save the tree to hsa05130.xml using UTF-8 encoding
saveXML(root, file="hsa05130.xml",encoding="UTF-8")
4. 网页文件
4.1 安装包
在 R
中,我们使用 rvest
模块,进行网页解析以及获取网页内容。
# install "rvest" package
install.packages("rvest")
# library
library(rvest)
4.2 解析网页
我们还是以昨天的链接为例
首先用 read_html
读取网页链接
然后用 html_text
读取整个网页内容,返回的是一个字符串
# URL of the page to fetch
URL <- "http://rest.kegg.jp/get/cpd:C01290"
# Read the page at URL
res <- read_html(URL)
# Extract the page text as a single string
text <- html_text(res)
4.3 提取内容
# Split the page text into individual lines.
# strsplit() returns a list of length 1 here, so unlist() flattens it into a
# character vector.
line_list <- unlist(strsplit(text, split = "\n"))
# Empty list that collects the extracted fields.
data <- list()
# Field name currently in effect. Lines whose prefix holds no word characters
# reuse the key of the preceding line; starting from "" keeps the comparisons
# below valid even when the very first line carries no field name.
key <- ""
# seq_along() gives an empty sequence for empty input, whereas
# 1:length(line_list) would iterate over c(1, 0).
for (i in seq_along(line_list)) {
  line <- line_list[i]
  # The first 12 characters hold the field name: substr(x, start, stop)
  # returns the characters of x from position start to position stop.
  prefix <- substr(line, 1, 12)
  # A prefix containing word characters starts a new field.
  if (grepl("\\w+", prefix)) {
    # Strip the whitespace padding on both sides of the field name.
    key <- trimws(prefix)
  }
  # Everything from position 13 to the end of the line is the field value;
  # nchar(x) gives the length of string x.
  value <- substr(line, 13, nchar(line))
  if (key == "ENTRY") {
    # Split on runs of whitespace (Perl-style regex) and keep the first token.
    data$entry <- unlist(strsplit(value, split = "\\s+", perl = TRUE))[1]
  } else if (key == "NAME") {
    # Drop a trailing ";" only when it is present; removing the last character
    # unconditionally would truncate a name that has no terminator.
    v <- sub(";$", "", value)
    data$name <- c(data$name, v)
  } else if (key == "ENZYME") {
    # One line may list several identifiers separated by whitespace.
    v <- unlist(strsplit(value, split = "\\s+", perl = TRUE))
    data$enzyme <- c(data$enzyme, v)
  } else if (key == "DBLINKS") {
    # Split "<database>: <id>" and store the id under the database name.
    v <- unlist(strsplit(value, ": "))
    data$dblinks[v[1]] <- v[2]
  }
}
输出提取的信息
> data
$entry
[1] "C01290"
$name
[1] "Lactosylceramide"
[2] "beta-D-Galactosyl-(1->4)-beta-D-glucosyl-(11)-ceramide"
[3] "beta-D-Galactosyl-1,4-beta-D-glucosylceramide"
[4] "Gal-beta1->4Glc-beta1->1'Cer"
[5] "LacCer"
[6] "Lactosyl-N-acylsphingosine"
[7] "D-Galactosyl-1,4-beta-D-glucosylceramid"
$enzyme
[1] "2.4.1.92" "2.4.1.206" "2.4.1.228" "2.4.1.274" "2.4.99.1" "2.4.99.9" "3.2.1.18" "3.2.1.22"
[9] "3.2.1.23" "3.2.1.47" "3.2.1.52"
$dblinks
PubChem ChEBI LIPIDMAPS LipidBank
"4509" "17950" "LMSP0501AB00" "GSG1147"