spateo 构建bin_50 分析数据
exec('import warnings; warnings.filterwarnings("ignore")')
import dynamo as dyn
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import spateo as st
st.config.n_threads = 4
st.__version__
Load data
读取 BGI 的空间矩阵为AnnData并构建 bin50
adata = st.io.read_bgi("./E9.5_E2S2_GEM_bin1.tsv.gz",binsize=50)
adata
|-----> Using binsize=50
|-----> Constructing count matrices.
|-----> <insert> __type to uns in AnnData Object.
|-----> <insert> pp to uns in AnnData Object.
|-----> <insert> spatial to uns in AnnData Object.
AnnData object with n_obs × n_vars = 4332 × 24107
obs: 'area'
uns: '__type', 'pp', 'spatial'
obsm: 'spatial', 'contour', 'bbox'
绘制细胞细胞空间位置散点图
st.pl.space(adata, color=['area'], pointsize=0.1, show_legend="upper left", cmap="tab20") ### 数据来源于adata.obsm["spatial"]
QC
st.pp.filter.filter_cells(adata,min_expr_genes=500,inplace=True) ### 过滤表达基因数小于 500的细胞
st.pp.filter.filter_genes(adata, min_cells=3,inplace=True) ### 过滤 无效表达的基因
adata.var['mt'] = adata.var_names.str.startswith('mt-') ### 标记线粒体基因
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True) ### 计算线粒体基因表达比例
adata = adata[adata.obs.pct_counts_mt < 5, :] ### 过滤 线粒体基因高表达 的细胞
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True) ### 过滤 线粒体基因高表达 的细胞
adata.layers["raw"] = adata.X ### 把原始矩阵拷贝到adata的"raw"层
Spatial Clustering
### log transform & dimension reduction
adata.X = adata.layers["raw"] ### 从layer层提取 UMI counts matrix
dyn.pp.normalize_cell_expr_by_size_factors(adata, layers="X")
dyn.pp.log1p(adata)
st.tl.pca_spateo(adata, n_pca_components=50)
dyn.tl.neighbors(adata, n_neighbors=30)
### 使用空间约束聚类(st.tl.ssc)可以识别连续的组织区域
st.tl.scc(adata, s_neigh=8, resolution=1, cluster_method="louvain", key_added="scc", pca_key="X_pca")
### 绘制聚类点图 & 输出图片为 pdf
st.pl.space(adata, color=['scc'], pointsize=0.2, show_legend="upper left") ### 显示图片
st.pl.space(adata, color=['scc'], pointsize=0.2, show_legend="upper left",save_show_or_return='save',save_kwargs={"path":"bin50_scc_cluster", "ext":"pdf", "dpi":300, "width":1,"heigth":1})
Spatial Autocorrelation Features Selection
### 执行 moran'I test 检测空间自相关基因进行特征选择: 这一步会有点耗时
m = st.tl.spatial_degs.moran_i(adata,n_jobs=8)
### Feature selection
m_filter = m[(m.moran_p_val < 0.05)&(m.moran_q_val<0.05)].sort_values(by=['moran_i'],ascending=False)
m_filter.to_csv("mouse_brain_morani_filter.csv")
print(m_filter)
### 可视化莫兰指数 top5 和 bottom 5 的基因表达
st.pl.space(adata, genes=m_filter.index[0:5].tolist() + m_filter.index[-6:-1].tolist(),pointsize=0.4,ncols=5,show_legend="upper left",figsize=(8,8))
Cluster degs
### 计算 实验组与对照组 的差异基因
st.tl.find_cluster_degs(adata,group='scc',test_group='4',control_groups=['0','1','2'],genes=None,method='pairwise')
### 计算 所有簇之间的 差异基因
st.tl.find_all_cluster_degs(adata,group='scc',genes=None,n_jobs=8,copy=False)
all_markers = pd.concat(adata.uns["cluster_markers"]["deg_tables"], ignore_index=True, sort=False)
all_markers.to_csv("mouse_brain_all_markers.csv")
print(all_markers)
Spatial region degs
### 计算空间区域 marker ,该功能用以分析 与test-cluster在空间分布上临近的clusters之间的差异基因,针对特定项目需求
st.tl.find_spatial_cluster_degs(adata=adata , test_group='4',group='scc',k = 15 , ratio_thresh =0.25)
Save object
import joblib
joblib.dump(adata,"adata_bin50_cluster.pkl")