- log
- 高变基因
- 标准化
- pca
- 构建图
- 聚类
- tsne
seurat流程
# 1. Build the Seurat object
library(Seurat)

# min.cells: keep a gene only if it is detected in at least this many cells.
min.cells <- 0
# min.features: keep a cell only if it expresses at least this many genes.
min.features <- 0

# `raw.data` (count matrix) and `metadata` (per-cell annotation table) must
# already exist in the session; they are not created in this script.
# The annotation table has to be passed as `meta.data`: CreateSeuratObject()
# has no `metadata` argument, so that spelling is not picked up by the
# constructor.
sce <- CreateSeuratObject(
  counts = raw.data,
  meta.data = metadata,
  min.cells = min.cells,
  min.features = min.features
)
# No separate AddMetaData() call for "percent.ercc" here: no `percent.ercc`
# variable is defined at this point, and the column is computed from the
# object itself with PercentageFeatureSet() in the data-cleaning step.
# 2. Data cleaning
# A Seurat object can be subset on genes and cells in a data-frame-like way.
# Collect the names of spike-in (ERCC-), ribosomal-protein (RPS*/RPL*) and
# mitochondrial (MT-) features; value = TRUE makes grep() return the matching
# names rather than their positions.
erccs <- grep("^ERCC-", x = rownames(sce), value = TRUE)
rp <- grep("^RP[SL][[:digit:]]", x = rownames(sce), value = TRUE)
mt <- grep("^MT-", x = rownames(sce), value = TRUE)
# Store, per cell, the percentage of counts coming from each feature set.
sce[["percent.ercc"]] <- PercentageFeatureSet(sce, pattern = "^ERCC-")
sce[["percent.rp"]] <- PercentageFeatureSet(sce, pattern = "^RP[SL][[:digit:]]")
sce[["percent.mt"]] <- PercentageFeatureSet(sce, pattern = "^MT-")
# 3. Filtering
# Keep cells with more than 50000 total counts and more than 500 detected genes.
sce <- subset(x = sce, subset = nCount_RNA > 50000 & nFeature_RNA > 500)
# Seurat pipeline
# 1. Library-size normalisation followed by log transform
#    (scale.factor = 1e6, i.e. counts per million).
sce <- NormalizeData(
  object = sce,
  normalization.method = "LogNormalize",
  scale.factor = 1e6
)
# 2. Highly variable genes
sce <- FindVariableFeatures(
  object = sce,
  selection.method = "vst",
  nfeatures = 2000
)
# 3. Scaling
sce <- ScaleData(object = sce)
# 4. PCA
# `verbose = FALSE` is the Seurat v3+ way to silence the printed loadings;
# the v2-era `do.print` is not an argument of the current RunPCA().
sce <- RunPCA(object = sce, verbose = FALSE)
# 5. Build the nearest-neighbour graph on the first 20 PCs
sce <- FindNeighbors(sce, dims = 1:20)
# 6. Clustering
sce <- FindClusters(sce, resolution = 0.5)
# 7. t-SNE embedding
# The PC selection must be passed as `dims`; with the v2-era `dims.use` the
# v3+ function keeps its default of `dims = 1:5` instead of the 20 PCs used
# for the graph above.
sce <- RunTSNE(sce, dims = 1:20)
scanpy流程
# Create the scanpy (AnnData) object
import pandas as pd
import scanpy as sc

# Count table and per-cell metadata; the first column of each file is the index.
df = pd.read_csv('processfile/count.csv', index_col=0)
meta = pd.read_csv('processfile/metadata.csv', index_col=0)
# Relabel the count columns with the metadata index. This is positional:
# it assumes the columns of count.csv are in the same order as the rows of
# metadata.csv (pandas only raises if the lengths differ) -- verify upstream.
df.columns = meta.index
# Transpose so that rows are samples/cells and columns are genes.
df = df.T
# Minimal obs/var tables keyed by the cell and gene labels.
cellinfo = pd.DataFrame(df.index, index=df.index, columns=['sample_index'])
geneinfo = pd.DataFrame(df.columns, index=df.columns, columns=['genes_index'])
# Attach the full metadata to the per-cell table.
cellinfo = pd.concat([cellinfo, meta], axis=1)
sce = sc.AnnData(df, obs=cellinfo, var=geneinfo)
# 2. Data cleaning
sce.var_names_make_unique()
sce.obs_names_make_unique()
# Drop cells with fewer than 300 detected genes and genes detected in fewer
# than 5 cells.
sc.pp.filter_cells(sce, min_genes=300)
sc.pp.filter_genes(sce, min_cells=5)
mt = sce.var_names[sce.var_names.str.match(r'^MT-')]  # mitochondrial genes
rp = sce.var_names[sce.var_names.str.match(r'^RP[SL][0-9]')]  # ribosomal-protein genes
ercc = sce.var_names[sce.var_names.str.match(r'^ERCC-')]  # ERCC spike-ins
# Gene names containing a dot followed by a digit. The original evaluated this
# expression and discarded the result; keep it under a name so it can be inspected.
dotted = sce.var_names[sce.var_names.str.match(r'.*\.[0-9]')]
# Flag mitochondrial genes so calculate_qc_metrics() reports pct_counts_mt.
sce.var['mt'] = sce.var_names.str.match(r'^MT-')
sc.pp.calculate_qc_metrics(sce, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
# 3. Filtering
# Keep cells with < 15 % mitochondrial counts and < 25000 total counts.
# Both conditions are applied in a single mask and the result is copied, so
# `sce` is a standalone AnnData rather than a view of the unfiltered object
# when the in-place preprocessing steps that follow modify it.
keep = (sce.obs['pct_counts_mt'] < 15) & (sce.obs['total_counts'] < 25000)
sce = sce[keep, :].copy()
# Standard pipeline
# =============================================================================
# 1. Library-size normalisation + log1p
# 2. Highly variable genes
# 3. Regress out covariates + scaling
# 4. PCA
# 5. Build the neighbour graph
# 6. Clustering
# 7. t-SNE
# =============================================================================
# 1. Normalise every cell to 1e6 total counts, then log1p-transform.
sc.pp.normalize_total(sce, target_sum=1e6)
sc.pp.log1p(sce)
# 2. Flag the top 2000 highly variable genes. The step list above names this
#    stage, but the original code never ran it. Genes are only flagged in
#    sce.var, not removed; PCA picks the flag up.
# NOTE(review): this changes which genes feed PCA compared with the original
# all-genes run -- confirm this is the intended behaviour.
sc.pp.highly_variable_genes(sce, n_top_genes=2000)
# 3. Regress out sequencing depth and mitochondrial fraction, then scale.
sc.pp.regress_out(sce, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(sce)
# 4. PCA
sc.tl.pca(sce, svd_solver='arpack', random_state=42)
# 5. Neighbour graph on the first 20 PCs
sc.pp.neighbors(sce, n_neighbors=50, n_pcs=20, random_state=42)
# 6. Leiden clustering
sc.tl.leiden(sce, resolution=0.3, random_state=42)
# 7. t-SNE on the first 20 PCs, coloured by cluster
sc.tl.tsne(sce, n_pcs=20, n_jobs=30)
sc.pl.tsne(sce, color='leiden')
如果您觉得我的文章对您有帮助,请点赞+关注,可以的话打个赏奖励一杯星巴克(~ ̄(OO) ̄)ブ
Best Regards,
Yuan.SH;
School of Basic Medical Sciences,
Fujian Medical University,
Fuzhou, Fujian, China.
Please contact me via:
(a) e-mail: yuansh3354@163.com