前面我们也遇到过10x的数据读取不进去,如果大家遇到下面的报错,可以通过修改10x的原始重新读取,详细可以见链接https://blog.csdn.net/weixin_43949246/article/details/121225791
当然,这次跟大家说的是另一种10x的数据,同样的,我们先说报错,如果出现dgTMatrix的报错,我们该怎么办呢?
这里给大家推荐一种解决方法,首先先定义个函数,修改读取的方式
read10x_ymc <- function(
data.dir = NULL,
gene.column = 2,
unique.features = TRUE,
strip.suffix = FALSE,
prefix='',
suffix="tsv"
) {
full.data <- list()
for (i in seq_along(along.with = data.dir)) {
run <- data.dir[i]
if (!dir.exists(paths = run)) {
stop("Directory provided does not exist")
}
barcode.loc <- file.path(run, paste(prefix,'barcodes.',suffix,sep=''))
gene.loc <- file.path(run, paste(prefix,'genes.',suffix,sep=''))
features.loc <- file.path(run, paste(prefix,'features.',suffix,'.gz',sep=''))
matrix.loc=file.path(run,paste(prefix,'matrix.mtx',sep=''))
# Flag to indicate if this data is from CellRanger >= 3.0
pre_ver_3 <- file.exists(gene.loc)
if (!pre_ver_3) {
addgz <- function(s) {
return(paste0(s, ".gz"))
}
barcode.loc <- addgz(s = barcode.loc)
matrix.loc <- addgz(s = matrix.loc)
}
if (!file.exists(barcode.loc)) {
stop("Barcode file missing. Expecting ", basename(path = barcode.loc))
}
if (!pre_ver_3 && !file.exists(features.loc) ) {
stop("Gene name or features file missing. Expecting ", basename(path = features.loc))
}
if (!file.exists(matrix.loc)) {
print(matrix.loc)
stop("Expression matrix file missing. Expecting ", basename(path = matrix.loc))
}
data <- readMM(file = matrix.loc)
cell.names <- readLines(barcode.loc)
if(!dim(data)[2]==length(cell.names)){data <- t(readMM(file = matrix.loc))}###这是与cellranger输出结果最大的不同
if (all(grepl(pattern = "\\-1$", x = cell.names)) & strip.suffix) {
cell.names <- as.vector(x = as.character(x = sapply(
X = cell.names,
FUN = ExtractField,
field = 1,
delim = "-"
)))
}
if (is.null(x = names(x = data.dir))) {
if (i < 2) {
colnames(x = data) <- cell.names
} else {
colnames(x = data) <- paste0(i, "_", cell.names)
}
} else {
colnames(x = data) <- paste0(names(x = data.dir)[i], "_", cell.names)
}
feature.names <- read.delim(
file = ifelse(test = pre_ver_3, yes = gene.loc, no = features.loc),
header = FALSE,
stringsAsFactors = FALSE
)
if (any(is.na(x = feature.names[, gene.column]))) {
warning(
'Some features names are NA. Replacing NA names with ID from the opposite column requested',
call. = FALSE,
immediate. = TRUE
)
na.features <- which(x = is.na(x = feature.names[, gene.column]))
replacement.column <- ifelse(test = gene.column == 2, yes = 1, no = 2)
feature.names[na.features, gene.column] <- feature.names[na.features, replacement.column]
}
if (unique.features) {
fcols = ncol(x = feature.names)
if (fcols < gene.column) {
stop(paste0("gene.column was set to ", gene.column,
" but feature.tsv.gz (or genes.tsv) only has ", fcols, " columns.",
" Try setting the gene.column argument to a value <= to ", fcols, "."))
}
rownames(x = data) <- make.unique(names = feature.names[, gene.column])
}
# In cell ranger 3.0, a third column specifying the type of data was added
# and we will return each type of data as a separate matrix
if (ncol(x = feature.names) > 2) {
data_types <- factor(x = feature.names$V3)
lvls <- levels(x = data_types)
if (length(x = lvls) > 1 && length(x = full.data) == 0) {
message("10X data contains more than one type and is being returned as a list containing matrices of each type.")
}
expr_name <- "Gene Expression"
if (expr_name %in% lvls) { # Return Gene Expression first
lvls <- c(expr_name, lvls[-which(x = lvls == expr_name)])
}
data <- lapply(
X = lvls,
FUN = function(l) {
return(data[data_types == l, , drop = FALSE])
}
)
names(x = data) <- lvls
} else{
data <- list(data)
}
full.data[[length(x = full.data) + 1]] <- data
}
list_of_data <- list()
for (j in 1:length(x = full.data[[1]])) {
list_of_data[[j]] <- do.call(cbind, lapply(X = full.data, FUN = `[[`, j))
# Fix for Issue #913
print('yaomengcheng')
list_of_data[[j]] <- as(object = list_of_data[[j]], Class = "dgCMatrix")
}
names(x = list_of_data) <- names(x = full.data[[1]])
# If multiple features, will return a list, otherwise
# a matrix.
if (length(x = list_of_data) == 1) {
return(list_of_data[[1]])
} else {
return(list_of_data)
}
}
这时候重新读取进去,这样问题就可以解决
my.data<-read10x_ymc(data.dir = 'L1/',gene.column =2, prefix='',suffix="tsv")
sce=CreateSeuratObject(counts = my.data, project = 'L1',
#每个基因至少在3个细胞中表达,每一个细胞至少有250个基因表达
min.cells = 3, min.features = 250)
其实这里出现的问题,最主要是出现在dgCMatrix包这个问题上,而R语言中dgCMatrix包是用来专门大型稀疏矩阵的,当然大家可选择性重新安装Matrix包,但是不一定能解决问题,最好的方式,可以通过上述的方法解决该问题。有问题的小伙伴也可以私信我哈!
############新增,推荐一个在线分析的生信分析差异工具######
差异分析工具:http://www.sxdyc.com/singleCollectionTool?href-diff