# Reshape df3 from wide to long: the four column groups suffixed 1..4
# (const*, cdr3_aa*, v_name*, j_name*) are stacked on top of each other under
# the suffix-1 column names. The shared `n` column is carried along with
# every group, so each source row contributes its `n` up to four times.
chain_fields = ["const", "cdr3_aa", "v_name", "j_name"]
target_cols = [f"{field}1" for field in chain_fields] + ["n"]

chain_frames = []
for chain in range(1, 5):
    source_cols = [f"{field}{chain}" for field in chain_fields] + ["n"]
    # Map this group's column names onto the suffix-1 names so the frames
    # line up when concatenated (for chain 1 this is an identity mapping).
    new_cols = dict(zip(source_cols, target_cols))
    chain_frames.append(df3.loc[:, source_cols].rename(columns=new_cols))

# ignore_index=True renumbers the stacked rows 0..4*len(df3)-1.
df_out = pd.concat(chain_frames, ignore_index=True)
# Keep only rows where all five columns are present; the index is not reset,
# so gaps remain where rows were dropped.
df_out = df_out.dropna(how="any")
df_out
const1
cdr3_aa1
v_name1
j_name1
n
0
IGHG1
CAPIHYDYGTWFAYW
IGHV14-3
IGHJ3
59
1
IGHG1
CAPISYDYGTWFAYW
IGHV14-3
IGHJ3
201
2
IGHG1
CAPIHYDYGTWFAYW
IGHV14-3
IGHJ3
174
3
IGHG1
CAPIYYDYGTWFAYW
IGHV14-3
IGHJ3
173
4
IGHG1
CAPIHYDYGTWFAYW
IGHV14-3
IGHJ3
92
...
...
...
...
...
...
5999
IGKC
CQQYWSTPYTF
IGKV13-85
IGKJ2
1
6001
IGKC
CQQYNSYPLTF
IGKV6-15
IGKJ5
1
6002
IGKC
CQQYNSYPLTF
IGKV6-15
IGKJ5
1
6006
IGKC
CQQYNSYPFTF
IGKV6-15
IGKJ4
2
6107
IGLC1
CALWYSTIWVF
IGLV1
IGLJ1
1
3254 rows × 5 columns
# Bar chart of how many rows of df_out have each cdr3_aa1 string length.
lengths = [len(x) for x in df_out.cdr3_aa1]
# np.histogram ignores values beyond the outermost edges and closes the last
# bin on the right, so fixed edges 0..24 would drop lengths above 24 and
# count length 24 in the bar for 23. Extend the edges to one past the longest
# value; edges 0..24 are kept as the minimum so short data plots as before.
edges = np.arange(max(max(lengths, default=0), 23) + 2)
a = np.histogram(lengths, bins=edges)
plt.style.use('ggplot')
fig, ax = plt.subplots()
# a[1] holds the bin edges; every edge except the last is the length that
# the corresponding count in a[0] belongs to.
ax.bar(a[1][:-1], a[0])
ax.set_ylabel('num')
ax.set_xlabel('lengths')
# ax.set_title('CDR3 lengths distribute')
plt.show()
IGH CDR3 length distribution
# Rows of df_out whose const1 value starts with 'IGH', then a bar chart of
# how many of those rows have each cdr3_aa1 string length.
df_out_H = df_out[[x.startswith('IGH') for x in df_out.const1]]
lengths = [len(x) for x in df_out_H.cdr3_aa1]
# np.histogram ignores values beyond the outermost edges and closes the last
# bin on the right, so fixed edges 0..24 would drop lengths above 24 and
# count length 24 in the bar for 23. Extend the edges to one past the longest
# value; edges 0..24 are kept as the minimum so short data plots as before.
edges = np.arange(max(max(lengths, default=0), 23) + 2)
a = np.histogram(lengths, bins=edges)
plt.style.use('ggplot')
fig, ax = plt.subplots()
# a[1] holds the bin edges; every edge except the last is the length that
# the corresponding count in a[0] belongs to.
ax.bar(a[1][:-1], a[0])
ax.set_ylabel('num')
ax.set_xlabel('lengths')
# ax.set_title('CDR3 lengths distribute')
plt.show()
IGK & IGL CDR3 length distribution
# Rows of df_out whose const1 value starts with 'IGK' or 'IGL', then a bar
# chart of how many of those rows have each cdr3_aa1 string length.
# str.startswith accepts a tuple of prefixes, so one call covers both.
df_out_L = df_out[[x.startswith(('IGK', 'IGL')) for x in df_out.const1]]
lengths = [len(x) for x in df_out_L.cdr3_aa1]
# np.histogram ignores values beyond the outermost edges and closes the last
# bin on the right, so fixed edges 0..24 would drop lengths above 24 and
# count length 24 in the bar for 23. Extend the edges to one past the longest
# value; edges 0..24 are kept as the minimum so short data plots as before.
edges = np.arange(max(max(lengths, default=0), 23) + 2)
a = np.histogram(lengths, bins=edges)
plt.style.use('ggplot')
fig, ax = plt.subplots()
# a[1] holds the bin edges; every edge except the last is the length that
# the corresponding count in a[0] belongs to.
ax.bar(a[1][:-1], a[0])
ax.set_ylabel('num')
ax.set_xlabel('lengths')
# ax.set_title('CDR3 lengths distribute')
plt.show()
CDR3 abundance
# Sum of `n` for every distinct cdr3_aa1 value in df_out.
df_out_2 = df_out.loc[:, ["cdr3_aa1", "n"]]
# Group by the column label rather than by the Series object: with a label,
# the key column is always left out of the aggregated columns, so the result
# is a frame indexed by cdr3_aa1 with the single summed column `n`.
abu = df_out_2.groupby("cdr3_aa1").sum()
import numpy as npimport pandas as pdimport matplotlib.pyplot as pltdf = pd.read_csv("/mnt/g/20220309-scBCR/HY01-1F11_ALL.csv",sep=",",low_memory=False)df n datasets origins donors entropy_cell ne