cellranger VDJ 数据过滤

39 篇文章 1 订阅
13 篇文章 2 订阅
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("/mnt/g/20220309-scBCR/HY01-1F11_ALL.csv",sep=",",low_memory=False)
df
ndatasetsoriginsdonorsentropy_cellnearfardrefdref_aaext...group_idgroup_ncellsclonotype_idclonotype_ncellsnchainsexact_subclonotype_idbarcodesHY01-1F11_barcodesbarcodeHY01-1F11_barcode
059HY01-1F11s1d10.0NaNNaN146NaN...114771147731AAAGCAAAGTACGCGA-1,AACACGTGTTGGTTTG-1,ACACCGGG...AAAGCAAAGTACGCGA-1,AACACGTGTTGGTTTG-1,ACACCGGG...AAAGCAAAGTACGCGA-1AAAGCAAAGTACGCGA-1
159HY01-1F11s1d10.0NaNNaN146NaN...114771147731AAAGCAAAGTACGCGA-1,AACACGTGTTGGTTTG-1,ACACCGGG...AAAGCAAAGTACGCGA-1,AACACGTGTTGGTTTG-1,ACACCGGG...AACACGTGTTGGTTTG-1AACACGTGTTGGTTTG-1
259HY01-1F11s1d10.0NaNNaN146NaN...114771147731AAAGCAAAGTACGCGA-1,AACACGTGTTGGTTTG-1,ACACCGGG...AAAGCAAAGTACGCGA-1,AACACGTGTTGGTTTG-1,ACACCGGG...ACACCGGGTACAGTGG-1ACACCGGGTACAGTGG-1
359HY01-1F11s1d10.0NaNNaN146NaN...114771147731AAAGCAAAGTACGCGA-1,AACACGTGTTGGTTTG-1,ACACCGGG...AAAGCAAAGTACGCGA-1,AACACGTGTTGGTTTG-1,ACACCGGG...ACAGCCGAGAATCTCC-1ACAGCCGAGAATCTCC-1
459HY01-1F11s1d10.0NaNNaN146NaN...114771147731AAAGCAAAGTACGCGA-1,AACACGTGTTGGTTTG-1,ACACCGGG...AAAGCAAAGTACGCGA-1,AACACGTGTTGGTTTG-1,ACACCGGG...ACCCACTTCGCCGTGA-1ACCCACTTCGCCGTGA-1
..................................................................
120161HY01-1F11s1d10.0NaNNaN00NaN...330011121ACGCAGCTCCAAGCCG-1ACGCAGCTCCAAGCCG-1ACGCAGCTCCAAGCCG-1ACGCAGCTCCAAGCCG-1
120171HY01-1F11s1d10.0NaNNaN20NaN...330111121GCGCGATTCTGCGGCA-1GCGCGATTCTGCGGCA-1GCGCGATTCTGCGGCA-1GCGCGATTCTGCGGCA-1
120181HY01-1F11s1d10.0NaNNaN76NaN...330211121AGAATAGTCGCTGATA-1AGAATAGTCGCTGATA-1AGAATAGTCGCTGATA-1AGAATAGTCGCTGATA-1
120191HY01-1F11s1d10.0NaNNaN44NaN...330311131AGTGTCATCACTTATC-1AGTGTCATCACTTATC-1AGTGTCATCACTTATC-1AGTGTCATCACTTATC-1
120201HY01-1F11s1d10.0NaNNaN86NaN...330411121GTGTTAGTCTAACTGG-1GTGTTAGTCTAACTGG-1GTGTTAGTCTAACTGG-1GTGTTAGTCTAACTGG-1

12021 rows × 380 columns

df1 = df.drop_duplicates(subset=["group_id","exact_subclonotype_id"],keep="first")
df2 = df1[(df1.nchains >= 2) & (df1.group_ncells >= 2)]
df3 = df2.loc[:,['group_id', 'group_ncells', 'exact_subclonotype_id', 'n', 'barcodes', 'nchains', 'u1', 'r1', 'v_name1', 'd_name1', 'j_name1','const1', 'cdr3_aa1', 'cdr3_dna1', 'fwr1_aa1', 'fwr1_dna1', 'cdr1_aa1', 'cdr1_dna1', 'fwr2_aa1', 'fwr2_dna1', 'cdr2_aa1', 'cdr2_dna1', 'fwr3_aa1', 'fwr3_dna1', 'fwr4_aa1', 'fwr4_dna1', 'vj_aa_nl1', 'vj_seq_nl1', 'u2', 'r2', 'v_name2', 'd_name2', 'j_name2','const2', 'cdr3_aa2', 'cdr3_dna2', 'fwr1_aa2', 'fwr1_dna2', 'cdr1_aa2', 'cdr1_dna2', 'fwr2_aa2', 'fwr2_dna2', 'cdr2_aa2', 'cdr2_dna2', 'fwr3_aa2', 'fwr3_dna2', 'fwr4_aa2', 'fwr4_dna2', 'vj_aa_nl2', 'vj_seq_nl2', 'u3', 'r3', 'v_name3', 'd_name3', 'j_name3','const3', 'cdr3_aa3', 'cdr3_dna3', 'fwr1_aa3', 'fwr1_dna3', 'cdr1_aa3', 'cdr1_dna3', 'fwr2_aa3', 'fwr2_dna3', 'cdr2_aa3', 'cdr2_dna3', 'fwr3_aa3', 'fwr3_dna3', 'fwr4_aa3', 'fwr4_dna3', 'vj_aa_nl3', 'vj_seq_nl3', 'u4', 'r4', 'v_name4', 'd_name4', 'j_name4','const4', 'cdr3_aa4', 'cdr3_dna4', 'fwr1_aa4', 'fwr1_dna4', 'cdr1_aa4', 'cdr1_dna4', 'fwr2_aa4', 'fwr2_dna4', 'cdr2_aa4', 'cdr2_dna4', 'fwr3_aa4', 'fwr3_dna4', 'fwr4_aa4', 'fwr4_dna4', 'vj_aa_nl4', 'vj_seq_nl4']]
df3.to_csv("/mnt/g/20220309-scBCR/HY01-1F11_filter.csv",index=False)

cell count

print("过滤后共有 %d 个细胞被保留下来进行进一步分析"%np.sum(df3.n))
过滤后共有 9256 个细胞被保留下来进行进一步分析

clonotype count

print("过滤后共有 %d 个clonotye被保留下来进行进一步分析"%len(set(df3.group_id)))
过滤后共有 539 个clonotye被保留下来进行进一步分析
plt.style.use('ggplot')
fig, ax = plt.subplots()

ax.bar(df3[df3.group_id <=10].group_id, df3[df3.group_id <= 10].group_ncells)
ax.set_ylabel('num_cells')
ax.set_title('Top 10 Clonotype')
#ax.set_xticks(ind, labels=['G1', 'G2', 'G3', 'G4', 'G5'])
#ax.legend()
plt.show()

请添加图片描述

CDR3 lengths distribute

new_cols = {x: y for x, y in zip(df3.loc[:,["const2","cdr3_aa2","v_name2","j_name2","n"]].columns,df3.loc[:,["const1","cdr3_aa1","v_name1","j_name1","n"]].columns)}
df_out = pd.concat([df3.loc[:,["const1","cdr3_aa1","v_name1","j_name1","n"]],df3.loc[:,["const2","cdr3_aa2","v_name2","j_name2","n"]].rename(columns=new_cols)],ignore_index=True)

new_cols = {x: y for x, y in zip(df3.loc[:,["const3","cdr3_aa3","v_name3","j_name3","n"]].columns,df3.loc[:,["const1","cdr3_aa1","v_name1","j_name1","n"]].columns)}
df_out = pd.concat([df_out,df3.loc[:,["const3","cdr3_aa3","v_name3","j_name3","n"]].rename(columns=new_cols)],ignore_index=True)

new_cols = {x: y for x, y in zip(df3.loc[:,["const4","cdr3_aa4","v_name4","j_name4","n"]].columns,df3.loc[:,["const1","cdr3_aa1","v_name1","j_name1","n"]].columns)}
df_out = pd.concat([df_out,df3.loc[:,["const4","cdr3_aa4","v_name4","j_name4","n"]].rename(columns=new_cols)],ignore_index=True)

df_out = df_out.dropna(how="any")
df_out
const1cdr3_aa1v_name1j_name1n
0IGHG1CAPIHYDYGTWFAYWIGHV14-3IGHJ359
1IGHG1CAPISYDYGTWFAYWIGHV14-3IGHJ3201
2IGHG1CAPIHYDYGTWFAYWIGHV14-3IGHJ3174
3IGHG1CAPIYYDYGTWFAYWIGHV14-3IGHJ3173
4IGHG1CAPIHYDYGTWFAYWIGHV14-3IGHJ392
..................
5999IGKCCQQYWSTPYTFIGKV13-85IGKJ21
6001IGKCCQQYNSYPLTFIGKV6-15IGKJ51
6002IGKCCQQYNSYPLTFIGKV6-15IGKJ51
6006IGKCCQQYNSYPFTFIGKV6-15IGKJ42
6107IGLC1CALWYSTIWVFIGLV1IGLJ11

3254 rows × 5 columns

a = np.histogram([len(x) for x in df_out.cdr3_aa1],bins=np.arange(25))

plt.style.use('ggplot')
fig, ax = plt.subplots()

ax.bar(a[1][0:24],a[0])
ax.set_ylabel('num')
ax.set_xlabel('lengths')
#ax.set_title('CDR3 lengths distribute')
plt.show()

请添加图片描述

IGH CDR3 lengths distribute
df_out_H = df_out[[x.startswith('IGH') for x in df_out.const1]]
a = np.histogram([len(x) for x in df_out_H.cdr3_aa1],bins=np.arange(25))

plt.style.use('ggplot')
fig, ax = plt.subplots()

ax.bar(a[1][0:24],a[0])
ax.set_ylabel('num')
ax.set_xlabel('lengths')
#ax.set_title('CDR3 lengths distribute')
plt.show()

请添加图片描述

IGK & IGL CDR3 lengths distribute
df_out_L = df_out[[x.startswith('IGK') or x.startswith('IGL')  for x in df_out.const1]]
a = np.histogram([len(x) for x in df_out_L.cdr3_aa1],bins=np.arange(25))

plt.style.use('ggplot')
fig, ax = plt.subplots()

ax.bar(a[1][0:24],a[0])
ax.set_ylabel('num')
ax.set_xlabel('lengths')
#ax.set_title('CDR3 lengths distribute')
plt.show()

请添加图片描述

CDR3 abundance

df_out_2 = df_out.loc[:,["cdr3_aa1","n"]]
abu = df_out_2.groupby(df_out_2.cdr3_aa1).sum()
abu = abu.sort_values(by=["n"],ascending=False)
plt.style.use('ggplot')
fig, ax = plt.subplots()

ax.bar(np.arange(20),abu.n[0:20])
ax.set_xticklabels([])
plt.show()

请添加图片描述

V gene usage

重链中V gene使用频率
df_out_3 = df_out_H.loc[:,["v_name1","n"]]
abu = df_out_3.groupby(df_out_3.v_name1).sum()
abu = abu.sort_values(by=["n"],ascending=False)
abu
n
v_name1
IGHV1-191582
IGHV14-31514
IGHV1-691080
IGHV1-56980
IGHV2-5917
......
IGHV1-182
IGHV9-42
IGHV5-9-31
IGHV1-431
IGHV1-421

81 rows × 1 columns

plt.style.use('ggplot')
fig, ax = plt.subplots()

ax.bar(np.arange(20),abu.n[0:20])
ax.set_xticklabels([])
plt.show()

请添加图片描述

轻链中V gene使用频率
df_out_3 = df_out_L.loc[:,["v_name1","n"]]
abu = df_out_3.groupby(df_out_3.v_name1).sum()
abu = abu.sort_values(by=["n"],ascending=False)
abu
n
v_name1
IGKV8-302057
IGKV10-961550
IGKV12-461167
IGKV6-17955
IGKV8-27864
......
IGKV4-531
IGKV6-291
IGKV3-31
IGKV1-1321
IGLV31

75 rows × 1 columns

plt.style.use('ggplot')
fig, ax = plt.subplots()

ax.bar(np.arange(20),abu.n[0:20])
ax.set_xticklabels([])
plt.show()

请添加图片描述

J gene usage

重链中J gene使用频率
df_out_4 = df_out_H.loc[:,["j_name1","n"]]
abu = df_out_4.groupby(df_out_4.j_name1).sum()
abu = abu.sort_values(by=["n"],ascending=False)
abu
n
j_name1
IGHJ34409
IGHJ42169
IGHJ21739
IGHJ11233
plt.style.use('ggplot')
fig, ax = plt.subplots()

ax.bar(np.arange(len(abu.n)),abu.n[0:])
ax.set_xticklabels([])
plt.show()

请添加图片描述

轻链中J gene使用频率
df_out_4 = df_out_L.loc[:,["j_name1","n"]]
abu = df_out_4.groupby(df_out_4.j_name1).sum()
abu = abu.sort_values(by=["n"],ascending=False)
abu
n
j_name1
IGKJ25702
IGKJ12143
IGKJ51355
IGKJ4624
IGLJ124
IGLJ217
IGLJ33
plt.style.use('ggplot')
fig, ax = plt.subplots()

ax.bar(np.arange(len(abu.n)),abu.n[0:])
ax.set_xticklabels([])
plt.show()

请添加图片描述

V_J pairs

主要是看V gene和J gene联用频率
df_pair = df_out.groupby(["v_name1","j_name1"]).sum()
df_pair
n
v_name1j_name1
IGHV1-12IGHJ32
IGHV1-14IGHJ14
IGHJ227
IGHJ34
IGHJ433
.........
IGKV9-129IGKJ43
IGLV1IGLJ124
IGLJ33
IGLV2IGLJ216
IGLV3IGLJ21

361 rows × 1 columns

df_pair.sort_values(by="n",ascending=False)
n
v_name1j_name1
IGKV8-30IGKJ22041
IGHV1-19IGHJ31575
IGKV10-96IGKJ21520
IGHV14-3IGHJ31504
IGHV1-69IGHJ11015
.........
IGHV5-6-3IGHJ11
IGHV5-6-5IGHJ31
IGKV1-132IGKJ11
IGHV5-9-3IGHJ21
IGLV3IGLJ21

361 rows × 1 columns

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值