import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import pandas as pd
GCcounttxt 文件,5列,“chrom”,“start”,“end”,“区间内gc比例”,“reads”
bed文件
画柱状图,曲线图,以及注释
def draw_pic(GCcounttxt):
# GCcounttxt 5列,"chrom","start","end","gc","reads"
sampleid = GCcounttxt.split(".")[0]
data = pd.read_table(GCcounttxt, names=["chrom","start","end","gc","reads"])
datasort = data.sort_values(by=["gc"])
gc = list(datasort["gc"])
reads = list(datasort["reads"])
bin_num = np.arange(len(datasort))
#############################################################
annobed = pd.read_table("anno.bed", names=["chrom","start","end","gene"])
mergedata = pd.merge(datasort, annobed, on=["chrom","start","end"])
mergedata["gene"] = mergedata["gene"].apply(lambda x:x.split(",")[0]) #只保留第一个基因
mergedata = mergedata.loc[mergedata["gene"] !="-"]
group_gene = mergedata.groupby('gene')
dict1={"gene":[], "gc_gene":[], "read_gene":[], "gene_len":[]}
for gene , group in group_gene:
gc_gene = group["gc"].mean()
read_gene = group["reads"].sum()
group["bed_len"] = group["end"]- group["start"]
gene_len = group["bed_len"].sum()
read_gene2 = read_gene / gene_len # 标准化基因覆盖的reads
dict1["gene"].append(gene)
dict1["gc_gene"].append(gc_gene)
dict1["read_gene"].append(read_gene2)
dict1["gene_len"].append(gene_len)
gene_frame = pd.DataFrame(dict1)
gene_frame = gene_frame.sort_values(by=["gc_gene"]) # 按照基因gc含量排序
gene_frame = gene_frame.reset_index(drop=True)
out = "%s.gene.txt"%sampleid
gene_frame.to_csv(out,sep="\t",index=False)
###############################################################
gene_num = np.arange(len(gene_frame["gene"]))
gc = list(gene_frame["gc_gene"])
reads = list(gene_frame["read_gene"])
CEBPA = gene_frame.loc[gene_frame["gene"]=="CEBPA"]
index1 = list(CEBPA.index)[0] #CEBPA 基因横坐标
gc1 = list(CEBPA["gc_gene"])[0] # gc含量 ,左y轴坐标
read_gene1 = list(CEBPA["read_gene"])[0] # reads数 ,右y轴坐标
NPM1 = gene_frame.loc[gene_frame["gene"]=="NPM1"]
index2 = list(NPM1.index)[0]
gc2 = list(NPM1["gc_gene"])[0]
read_gene2 = list(NPM1["read_gene"])[0]
print(NPM1)
print(CEBPA)
with PdfPages("%s.pdf"%sampleid) as pdf:
fig,ax1 = plt.subplots(figsize=(21,10))
plt.title(sampleid)
ax1.plot(gene_num, gc,alpha=1, color="g", linestyle='-', linewidth=3.0) # alpha 透明度,color 柱子颜色;linestyle 线型 ,linewidth 线粗细, s= 10 点的大小
ax1.set_xlabel('gene', size =18) # x轴标签
ax1.set_ylabel('gc(%)', color = 'g',size=18) # y轴标签
plt.annotate("CEBPA\n gc:%.4f"%gc1 ,
xy=(index1, gc1),
xytext=(index1-1000, 0.8),
weight="bold",
color='r',
size='15',
arrowprops=dict(arrowstyle="->",connectionstyle="arc3",color="r"))
# 对所选基因注释,在图中标出基因信息 xy为 基因横纵坐标位置 ,xytext 为注释的文本信息显示位置坐标,weight 设置字体线型 ,arrowprops dict格式,设置指向箭头的参数,字典中key值有①arrowstyle:设置箭头的样式,其value候选项如’->’,’|-|’,’-|>’,也可以用字符串’simple’,‘fancy’等 ; ②connectionstyle:设置箭头的形状,为直线或者曲线,候选项有’arc3’,‘arc’,‘angle’,‘angle3’,可以防止箭头被曲线内容遮挡
plt.annotate("NPM1\n gc:%.4f"%gc2 ,
xy=(index2, gc2),
xytext=(index2+1000, 0.6),
weight="bold",
color='r',
size='15',
arrowprops=dict(arrowstyle="->",connectionstyle="arc3",color="r"))
ax2 = ax1.twinx() # 设置第二坐标轴 twinx 设置第二纵坐标 twiny设置第二横坐标。 这里貌似必须先注释完第一个后才能写, 如果在注释gc前写入,注释时写入的坐标是以第二坐标轴为准的。所以以第一y轴为准的注释应该先注释,然后再写ax1.twinx()
ax2.scatter(gene_num, reads, alpha=0.4, color="b", linestyle='--', linewidth=1.0)
ax2.set_ylim(0,10) # y轴值 区间
ax2.set_ylabel('read_count', color = 'b',size=18)
plt.annotate("CEBPA\nread:%.4f"%read_gene1 ,
xy=(index1, read_gene1),
xytext=(index1-3000, 0.3),
weight="bold",
color='r',
size='15',
arrowprops=dict(arrowstyle="->",connectionstyle="arc3",color="r"))
plt.annotate("NPM1\nread:%.4f"%read_gene2 ,
xy=(index2, read_gene2),
xytext=(index2+1000, 1),
weight="bold",
color='r',
size='15',
arrowprops=dict(arrowstyle="->",connectionstyle="arc3",color="r"))
plt.show()
pdf.savefig(fig) # 保存图片在pdf
plt.close()
draw_pic(GCcounttxt)
box图
import seaborn as sns
import matplotlib.pyplot as plt
fig1=plt.figure(figsize=(24, 10))
for i in range(6):
i+=1
frame=pd.read_table(f'Region{i}.xls') # frame包含需要画图的特征列,以及特征的分组列
plt.subplot(2,3,i) # 画布上子图为2行3列,6个小图,i为指定的第i个子图
plt.title(f'Region{i}')
sns.boxplot(x='clinic_label',y="Methylation Level",data=frame,width=0.3) # x为分类标签,y为画图的特征值
sns.swarmplot(x='clinic_label',y="Methylation Level", data=df, color="grey",size=4) # 在Box图中添加散点
#frame.to_csv('Region1.xls',index=False,sep='\t')
plt.tight_layout()
plt.show()
如果将多个fig1保存为pdf
figlists=[]
将fig1保存在figlists
with PdfPages("result.pdf") as pdf:
for fig in figlists:
pdf.savefig(fig)
画显著性比较
frame格式如图
from statannotations.Annotator import Annotator
for i in ['VIM_dCT','ONECUT2_dCT','TERT-M1_dCT','FGFR3-M1_dCT','TERT-M2_dCT','FGFR3-M2_dCT']:
fig1=plt.figure(figsize=(20, 10))
plt.title(i, fontsize=18)
ax = sns.boxplot(x='label',y=i,data=concatframe,width=0.3)
sns.swarmplot(x='label',y=i, data=concatframe, color="grey",size=4)
plt.ylabel(i, size =18)
#plt.xticks(fontsize=17)
plt.xlabel('')
plt.axhline(y=5, color='r', linestyle='--') # 在box图上y=5处画一条横线
pairs=[("Case", "Benign_disease"), ("Case", "Interference"), ("Benign_disease", "Interference")]
annotator = Annotator(ax, pairs, data=concatframe, x='label', y=i) # x为标签列,y为数值列
annotator.configure(test='Mann-Whitney', text_format='star',line_height=0.03,line_width=1)
annotator.apply_and_annotate()
plt.show()