# GSE_Expression_Matrix_GPLtxt.R
# 工程文件目录:
# 项目文件目录:
# 整理下载的数据
# 指针换基因名(根据GPLtxt)
# Start from a clean session: remove every visible object in the current
# environment, then trigger a garbage collection.
rm(list = ls())
gc()

# Session options: keep strings as character vectors (no automatic factor
# conversion) and use a very long connection timeout (in seconds).
options(
  stringsAsFactors = FALSE,
  timeout = 200000
)

# Raise the vroom connection buffer size (currently disabled).
# Sys.setenv("VROOM_CONNECTION_SIZE"=99999999)

# Load the project's package-management helpers, then the packages used by
# this script (pkgs_in is presumably defined in the sourced file).
source("./R_Function/R_Packages_Manage.R", encoding = "utf-8")
pkgs_in(c(
  "GEOquery", "tidyverse", "magrittr", "readxl",
  "limma", "dplyr", "stringr", "data.table"
))

# Input and output directories for this step.
inputPath <- "./0_GEO/2_GSE_Expression_Matrix_GPLtxt/1_Input/"
outputPath <- "./0_GEO/2_GSE_Expression_Matrix_GPLtxt/2_Output/"
# GSE_NO <- read.delim(paste0(inputPath, "GSE_NO.txt"), header = FALSE, stringsAsFactors = FALSE)
# GPL_NO <- read.delim(paste0(inputPath, "GPL_NO.txt"), header = FALSE, stringsAsFactors = FALSE)
# List the files in the input directory whose names start with "GSE" and end
# with ".gz", then reduce each file name to the text before its first "_" and,
# within that, before its first "-". The distinct values are stored in a
# one-column data frame (column `GSE_NO`).
gse_files <- local({
  archive_names <- list.files(
    path = inputPath,
    pattern = "^GSE.*\\.gz$",
    full.names = FALSE
  )
  before_underscore <- str_split_fixed(archive_names, "_", 2)[, 1]
  before_dash <- str_split_fixed(before_underscore, "-", 2)[, 1]
  unique(data.frame(GSE_NO = before_dash))
})
# Initialise `results` as an empty data frame (zero rows, zero columns).
results <- data.frame()
# NOTE(review): assigning list(NULL) across an already-empty data frame looks
# redundant -- confirm it is a no-op before removing it.
results[] <- list(NULL)
find_matching_columns <- function(df, pattern) {
# 分割列名为单词列表
column_words <- strsplit(names(df), "\\s+|_|-|\\.|,")
# 定义一个向量来存储匹配的列名和它们的匹配分数
matched_columns <- lis