作者,Evil Genius
目前分析scATAC的几款软件包括signac(R版本)、ArchR(R版本)、epiScanpy(python版本)、 SnapATAC2(python版本),各自都发了大文章,教程在
主流的分析:scATAC基础分析、单细胞RNA和ATAC联合分析、ATAC的区域差异分析、ATAC的多样本联合分析、多组学分析(ATAC + RNA)。
大家只要玩明白其中的一个,单细胞联合ATAC分析就不是问题。
今天我们分享的是snapATAC2,2024年1月发表于Nature Methods。

Standard pipeline
import snapatac2 as snap
data = snap.pp.import_data(
fragment_file,
chrom_sizes=snap.genome.hg38,
file="pbmc.h5ad", # Optional
sorted_by_barcode=False,
)
snap.pp.filter_cells(data, min_counts=5000, min_tsse=10, max_counts=100000)
snap.pp.add_tile_matrix(data)
snap.pp.select_features(data, n_features=250000)
Doublet removal
snap.pp.scrublet(data)
snap.pp.filter_doublets(data)
Dimenstion reduction
snap.tl.spectral(data)
snap.tl.umap(data)
Clustering analysis
snap.pp.knn(data)
snap.tl.leiden(data)
snap.pl.umap(data, color='leiden', interactive=False, height=500)

Cell cluster annotation
Create the cell by gene activity matrix
gene_matrix = snap.pp.make_gene_matrix(data, snap.genome.hg38)
Imputation
import scanpy as sc
sc.pp.filter_genes(gene_matrix, min_cells= 5)
sc.pp.normalize_total(gene_matrix)
sc.pp.log1p(gene_matrix)
sc.external.pp.magic(gene_matrix, solver="approximate")
gene_matrix.obsm["X_umap"] = data.obsm["X_umap"]
单细胞RNA与ATAC的联合分析
import warnings
warnings.filterwarnings("ignore")
import snapatac2 as snap
import anndata as ad
import pandas as pd
import scanpy as sc
import scvi
import numpy as np
scvi.settings.seed = 0
snap.__version__
reference = snap.read(snap.datasets.pbmc10k_multiome(), backed=None)
atac = snap.read(snap.datasets.pbmc5k(type='h5ad'), backed=None)
query = snap.pp.make_gene_matrix(atac, gene_anno=snap.genome.hg38)
query.obs['cell_type'] = pd.NA
data = ad.concat(
[reference, query],
join='inner',
label='batch',
keys=["reference", "query"],
index_unique='_',
)
sc.pp.filter_genes(data, min_cells=5)
sc.pp.highly_variable_genes(
data,
n_top_genes = 5000,
flavor="seurat_v3",
batch_key="batch",
subset=True
)
Data integration
scvi.model.SCVI.setup_anndata(data, batch_key="batch")
vae = scvi.model.SCVI(
data,
n_layers=2,
n_latent=30,
gene_likelihood="nb",
dispersion="gene-batch",
)
vae.train(max_epochs=1000, early_stopping=True)
data.obs["celltype_scanvi"] = 'Unknown'
ref_idx = data.obs['batch'] == "reference"
data.obs["celltype_scanvi"][ref_idx] = data.obs['cell_type'][ref_idx]
lvae = scvi.model.SCANVI.from_scvi_model(
vae,
adata=data,
labels_key="celltype_scanvi",
unlabeled_category="Unknown",
)
lvae.train(max_epochs=1000, n_samples_per_label=100)
data.obs["C_scANVI"] = lvae.predict(data)
data.obsm["X_scANVI"] = lvae.get_latent_representation(data)
sc.pp.neighbors(data, use_rep="X_scANVI")
sc.tl.umap(data)
sc.pl.umap(data, color=['C_scANVI', "batch"], wspace=0.45)
atac.obs['cell_type'] = data.obs.loc[atac.obs_names + '_query']['C_scANVI'].to_numpy()
sc.pl.umap(atac, color=['cell_type', "leiden"], wspace=0.45)

Identify differentially accessible regions
import snapatac2 as snap
import numpy as np
import polars as pl
data = snap.read(snap.datasets.pbmc5k(type="annotated_h5ad"), backed=None)
snap.pl.umap(data, color='cell_type', interactive=False)

Peak calling at the cluster-level
snap.tl.macs3(data, groupby='cell_type')
peaks = snap.tl.merge_peaks(data.uns['macs3'], snap.genome.hg38)
peak_mat = snap.pp.make_peak_matrix(data, use_rep=peaks['Peaks'])
Finding marker regions
marker_peaks = snap.tl.marker_regions(peak_mat, groupby='cell_type', pvalue=0.01)
snap.pl.regions(peak_mat, groupby='cell_type', peaks=marker_peaks, interactive=False)

%%time
motifs = snap.tl.motif_enrichment(
motifs=snap.datasets.cis_bp(unique=True),
regions=marker_peaks,
genome_fasta=snap.genome.hg38,
)
snap.pl.motif_enrichment(motifs, max_fdr=0.0001, height=1600, interactive=False)

Regression-based differential test
group1 = "Naive B"
group2 = "Memory B"
naive_B = data.obs['cell_type'] == group1
memory_B = data.obs['cell_type'] == group2
peaks_selected = np.logical_or(
peaks[group1].to_numpy(),
peaks[group2].to_numpy(),
)
%%time
diff_peaks = snap.tl.diff_test(
peak_mat,
cell_group1=naive_B,
cell_group2=memory_B,
features=peaks_selected,
)
diff_peaks = diff_peaks.filter(pl.col('adjusted p-value') < 0.01)
diff_peaks.head()

snap.pl.regions(
peak_mat,
groupby = 'cell_type',
peaks = {
group1: diff_peaks.filter(pl.col("log2(fold_change)") > 0)['feature name'].to_numpy(),
group2: diff_peaks.filter(pl.col("log2(fold_change)") < 0)['feature name'].to_numpy(),
},
interactive = False,
)

barcodes = np.array(data.obs_names)
background = []
for i in np.unique(data.obs['cell_type']):
if i != group2:
cells = np.random.choice(barcodes[data.obs['cell_type'] == i], size=30, replace=False)
background.append(cells)
background = np.concatenate(background)
diff_peaks = snap.tl.diff_test(
peak_mat,
cell_group1=memory_B,
cell_group2=background,
features=peaks[group2].to_numpy(),
direction="positive",
)
diff_peaks = diff_peaks.filter(pl.col('adjusted p-value') < 0.01)
diff_peaks.head()

snap.pl.regions(
peak_mat,
groupby='cell_type',
peaks={ group2: diff_peaks['feature name'].to_numpy() },
interactive=False,
)

Multi-sample Pipeline: analyzing snATAC-seq data of human colon samples
import snapatac2 as snap
import numpy as np
files = snap.datasets.colon()
adatas = snap.pp.import_data(
[fl for _, fl in files],
file=[name + '.h5ad' for name, _ in files],
chrom_sizes=snap.genome.hg38,
min_num_fragments=1000,
)
snap.metrics.tsse(adatas, snap.genome.hg38)
snap.pp.filter_cells(adatas, min_tsse=7)
snap.pp.add_tile_matrix(adatas, bin_size=5000)
snap.pp.select_features(adatas, n_features=50000)
snap.pp.scrublet(adatas)
snap.pp.filter_doublets(adatas)
data = snap.AnnDataSet(
adatas=[(name, adata) for (name, _), adata in zip(files, adatas)],
filename="colon.h5ads"
)
unique_cell_ids = [sa + ':' + bc for sa, bc in zip(data.obs['sample'], data.obs_names)]
data.obs_names = unique_cell_ids
assert data.n_obs == np.unique(data.obs_names).size
snap.pp.select_features(data, n_features=50000)
snap.tl.spectral(data)
snap.tl.umap(data)

Batch correction
snap.pp.mnc_correct(data, batch="sample")
snap.pp.harmony(data, batch="sample", max_iter_harmony=20)
snap.tl.umap(data, use_rep="X_spectral_mnn")
snap.pl.umap(data, color="sample", interactive=False)

snap.tl.umap(data, use_rep="X_spectral_harmony")
snap.pl.umap(data, color="sample", interactive=False)

聚类
snap.pp.knn(data, use_rep="X_spectral_harmony")
snap.tl.leiden(data)
snap.pl.umap(data, color="leiden", interactive=False)

Peak calling
snap.tl.macs3(data, groupby='leiden', replicate='sample')
merged_peaks = snap.tl.merge_peaks(data.uns['macs3'], chrom_sizes=snap.genome.hg38)
Multi-modality pipeline: analyzing single-cell multiome data (ATAC + Gene Expression)
import snapatac2 as snap
import scanpy as sc
rna = snap.read(snap.datasets.pbmc10k_multiome(modality='RNA'), backed=None)
sc.pp.highly_variable_genes(rna, flavor='seurat_v3', n_top_genes=3000)
rna = rna[:, rna.var.highly_variable]
sc.pp.normalize_total(rna, target_sum=1e4)
sc.pp.log1p(rna)
snap.tl.spectral(rna, features=None)
snap.tl.umap(rna)
snap.pl.umap(rna, color='cell_type', interactive=False, height=550)

Analyze chromatin accessibility data
atac = snap.read(snap.datasets.pbmc10k_multiome(modality='ATAC'), backed=None)
snap.tl.spectral(atac, features=None)
snap.tl.umap(atac)
snap.pl.umap(atac, color="cell_type", interactive=False, height=550

Perform joint embedding
assert (rna.obs_names == atac.obs_names).all()
embedding = snap.tl.multi_spectral([rna, atac], features=None)[1]
atac.obsm['X_joint'] = embedding
snap.tl.umap(atac, use_rep='X_joint')
snap.pl.umap(atac, color="cell_type", interactive=False, height=550)

生活很好,有你更好