# GSE_Expression_Matrix.R
# 整理下载的数据
# 探针换基因名 (map probe IDs to gene symbols)
# 工程文件目录:

# 项目文件目录:

# ---- Session setup ----
# Start from a clean global environment and trigger a garbage collection.
# NOTE(review): rm(list = ls()) wipes whatever the caller had in the global
# environment; kept as-is because later parts of the script may rely on it.
rm(list = ls())
gc()
# Spell out FALSE: the shorthand `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved constant.
options(stringsAsFactors = FALSE)
# Raise the timeout (in seconds) used for downloads / connections.
options(timeout = 200000)
# Enlarge the connection buffer read by the vroom package. This is not a
# general R memory limit. Presumably needed for very long lines in the GEO
# matrix files -- TODO confirm.
Sys.setenv("VROOM_CONNECTION_SIZE" = 99999999)

# ---- Packages ----
# Load the project's package-management helper; `pkgs_in()` is presumably
# defined there and installs/attaches the listed packages -- verify.
source("./R_Function/R_Packages_Manage.R", encoding = "utf-8")
rpackages <- c(
  "GEOquery", "tidyverse", "magrittr", "readxl", "limma", "dplyr",
  "stringr", "data.table", "impute", "org.Hs.eg.db", "AnnotationDbi"
)
pkgs_in(rpackages)

# ---- Paths ----
inputPath <- "./0_GEO/2_GSE_Expression_Matrix/1_Input/"
outputPath <- "./0_GEO/2_GSE_Expression_Matrix/2_Output/"
# GSE_NO <- read.delim(paste0(inputPath, "GSE_NO.txt"), header = FALSE, stringsAsFactors = FALSE)
# GPL_NO <- read.delim(paste0(inputPath, "GPL_NO.txt"), header = FALSE, stringsAsFactors = FALSE)

# ---- Discover GSE accessions ----
# List the gzip files in the input directory whose names start with "GSE".
gse_files <- list.files(
  path = inputPath,
  pattern = "^GSE.*\\.gz$",
  full.names = FALSE
)
# Keep only the part of each file name before the first "_" and then before
# the first "-", and de-duplicate the result into a one-column data frame
# named GSE_NO.
gse_files <- as.data.frame(gse_files) %>%
  mutate(across(everything(), ~ str_split_fixed(., "_", 2)[, 1])) %>%
  mutate(across(everything(), ~ str_split_fixed(., "-", 2)[, 1])) %>%
  set_colnames("GSE_NO") %>%
  unique()

# Empty data frame that the rest of the script presumably fills with
# per-dataset results -- verify against the loop below.
results <- data.frame()
results[] <- list(NULL)
# Rank the columns of `df` by how many words of their name match `pattern`.
#
# Each column name is split into words on whitespace, "_", "-", "." and ",".
# Names that split into more than two words are ignored. For the remaining
# names, the number of words matching `pattern` (case-insensitive regular
# expression) is the column's score.
#
# Arguments:
#   df      - object whose names() are the candidate column names.
#   pattern - regular expression passed to grepl().
#
# Returns the names of the columns with a score above zero, ordered by
# descending score; NULL when no column matches.
find_matching_columns <- function(df, pattern) {
  col_names <- names(df)
  tokens <- strsplit(col_names, "\\s+|_|-|\\.|,")

  # Named integer accumulator: column name -> number of matching words.
  # Assigning by name means a repeated column name keeps a single entry.
  scores <- integer(0)

  # Only names made of at most two words are considered.
  for (idx in which(lengths(tokens) <= 2)) {
    hits <- sum(grepl(pattern, tokens[[idx]], ignore.case = TRUE))
    if (hits > 0) {
      scores[col_names[idx]] <- hits
    }
  }

  # Highest score first.
  ranked <- sort(scores, decreasing = TRUE)
  names(ranked)
}
for(GSE_NO1 in gse_files$GSE_NO){
#

最低0.47元/天 解锁文章
1423

被折叠的 条评论
为什么被折叠?



