python实验3：读取压缩文件与pandas以及matplotlib.pyplot的学习

最新推荐文章于 2024-08-03 19:27:22 发布

小何同学#

最新推荐文章于 2024-08-03 19:27:22 发布

阅读量356

点赞数 5

文章标签： python 笔记经验分享

本文链接：https://blog.csdn.net/m0_69913686/article/details/139376173

版权

实验作业3-1

1.1 从文件gencode.v46.chr_patch_hapl_scaff.annotation.gff3.gz中提取type为gene的记录。

# 导入所需的库
import os
import pandas as pd

# 设置工作目录
os.chdir('D:/桌面/实验/python实验/python-class/')

# 读取压缩包数据
df = pd.read_csv('./gencode.v46.chr_patch_hapl_scaff.annotation.gff3.gz',
                 compression = 'gzip', sep = '\t', comment = '#', header = None, low_memory = False)

# 检查数据
df.head()
##       0       1  ...  7                                                  8
## 0  chr1  HAVANA  ...  .  ID=ENSG00000290825.1;gene_id=ENSG00000290825.1...
## 1  chr1  HAVANA  ...  .  ID=ENST00000456328.2;Parent=ENSG00000290825.1;...
## 2  chr1  HAVANA  ...  .  ID=exon:ENST00000456328.2:1;Parent=ENST0000045...
## 3  chr1  HAVANA  ...  .  ID=exon:ENST00000456328.2:2;Parent=ENST0000045...
## 4  chr1  HAVANA  ...  .  ID=exon:ENST00000456328.2:3;Parent=ENST0000045...
## 
## [5 rows x 9 columns]
pd.set_option('display.max_columns', None)
df.head()
##       0       1           2      3      4  5  6  7  \
## 0  chr1  HAVANA        gene  11869  14409  .  +  .   
## 1  chr1  HAVANA  transcript  11869  14409  .  +  .   
## 2  chr1  HAVANA        exon  11869  12227  .  +  .   
## 3  chr1  HAVANA        exon  12613  12721  .  +  .   
## 4  chr1  HAVANA        exon  13221  14409  .  +  .   
## 
##                                                    8  
## 0  ID=ENSG00000290825.1;gene_id=ENSG00000290825.1...  
## 1  ID=ENST00000456328.2;Parent=ENSG00000290825.1;...  
## 2  ID=exon:ENST00000456328.2:1;Parent=ENST0000045...  
## 3  ID=exon:ENST00000456328.2:2;Parent=ENST0000045...  
## 4  ID=exon:ENST00000456328.2:3;Parent=ENST0000045...
df[2]
## 0                gene
## 1          transcript
## 2                exon
## 3                exon
## 4                exon
##               ...    
## 3766027    transcript
## 3766028          exon
## 3766029          gene
## 3766030    transcript
## 3766031          exon
## Name: 2, Length: 3766032, dtype: object
# 赋予新的列名
col_names = ['chrid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
df.columns = col_names
df.head()
##   chrid  source        type  start    end score strand phase  \
## 0  chr1  HAVANA        gene  11869  14409     .      +     .   
## 1  chr1  HAVANA  transcript  11869  14409     .      +     .   
## 2  chr1  HAVANA        exon  11869  12227     .      +     .   
## 3  chr1  HAVANA        exon  12613  12721     .      +     .   
## 4  chr1  HAVANA        exon  13221  14409     .      +     .   
## 
##                                           attributes  
## 0  ID=ENSG00000290825.1;gene_id=ENSG00000290825.1...  
## 1  ID=ENST00000456328.2;Parent=ENSG00000290825.1;...  
## 2  ID=exon:ENST00000456328.2:1;Parent=ENST0000045...  
## 3  ID=exon:ENST00000456328.2:2;Parent=ENST0000045...  
## 4  ID=exon:ENST00000456328.2:3;Parent=ENST0000045...
# 提取type为gene的数据
df['type'].value_counts()
## type
## exon                                      1808998
## CDS                                        980651
## transcript                                 278220
## three_prime_UTR                            226422
## five_prime_UTR                             190230
## start_codon                                109068
## stop_codon                                 101701
## gene                                        70611
## stop_codon_redefined_as_selenocysteine        131
## Name: count, dtype: int64
df_gene = df[df['type'] == 'gene']
df_gene.head()
##    chrid   source  type  start    end score strand phase  \
## 0   chr1   HAVANA  gene  11869  14409     .      +     .   
## 5   chr1   HAVANA  gene  12010  13670     .      +     .   
## 13  chr1   HAVANA  gene  14696  24886     .      -     .   
## 25  chr1  ENSEMBL  gene  17369  17436     .      -     .   
## 28  chr1   HAVANA  gene  29554  31109     .      +     .   
## 
##                                            attributes  
## 0   ID=ENSG00000290825.1;gene_id=ENSG00000290825.1...  
## 5   ID=ENSG00000223972.6;gene_id=ENSG00000223972.6...  
## 13  ID=ENSG00000227232.6;gene_id=ENSG00000227232.6...  
## 25  ID=ENSG00000278267.1;gene_id=ENSG00000278267.1...  
## 28  ID=ENSG00000243485.5;gene_id=ENSG00000243485.5...

1.2 将chrid、start、end、gene_id、gene_type、gene_name(不包括“gene_id=”、“gene_type=”或“gene_name=”部分)和利用start和end计算得到的各基因长度length，整理为一个名为gdf的DataFrame(字段作为列名)。

# 合并
gdf = pd.concat([df_gene, df_gene['attributes'].str.split(';',expand=True)], axis = 1)

# 单独删去attributes列
del gdf['attributes']
# 批量删去source、type、strand、phase列
gdf.drop(columns = ['source', 'type', 'score', 'strand', 'phase', 0, 4, 5, 6, 7], inplace = True)
gdf.head()
##    chrid  start    end                          1  \
## 0   chr1  11869  14409  gene_id=ENSG00000290825.1   
## 5   chr1  12010  13670  gene_id=ENSG00000223972.6   
## 13  chr1  14696  24886  gene_id=ENSG00000227232.6   
## 25  chr1  17369  17436  gene_id=ENSG00000278267.1   
## 28  chr1  29554  31109  gene_id=ENSG00000243485.5   
## 
##                                                2                      3  
## 0                               gene_type=lncRNA      gene_name=DDX11L2  
## 5   gene_type=transcribed_unprocessed_pseudogene      gene_name=DDX11L1  
## 13              gene_type=unprocessed_pseudogene       gene_name=WASH7P  
## 25                               gene_type=miRNA    gene_name=MIR6859-1  
## 28                              gene_type=lncRNA  gene_name=MIR1302-2HG
# 重新赋予新的列名
gdf.columns = ['chrid', 'start', 'end', 'gene_id', 'gene_type', 'gene_name']

# 用等号拆分并保留右半部分
gdf['gene_id'] = [item.split('=')[1] for item in gdf['gene_id']]
gdf['gene_type'] = [item.split('=')[1] for item in gdf['gene_type']]
gdf['gene_name'] = [item.split('=')[1] for item in gdf['gene_name']]
gdf.head()
##    chrid  start    end            gene_id                           gene_type  \
## 0   chr1  11869  14409  ENSG00000290825.1                              lncRNA   
## 5   chr1  12010  13670  ENSG00000223972.6  transcribed_unprocessed_pseudogene   
## 13  chr1  14696  24886  ENSG00000227232.6              unprocessed_pseudogene   
## 25  chr1  17369  17436  ENSG00000278267.1                               miRNA   
## 28  chr1  29554  31109  ENSG00000243485.5                              lncRNA   
## 
##       gene_name  
## 0       DDX11L2  
## 5       DDX11L1  
## 13       WASH7P  
## 25    MIR6859-1  
## 28  MIR1302-2HG
# 用end-start得到length，并作为新列
gdf['length'] = gdf['end'] - gdf['start']
gdf['length']
## 0           2540
## 5           1660
## 13         10190
## 25            67
## 28          1555
##            ...  
## 3765934     2403
## 3765941     5898
## 3765957    23770
## 3766026      105
## 3766029      175
## Name: length, Length: 70611, dtype: int64
gdf['length'].describe()
## count    7.061100e+04
## mean     3.033053e+04
## std      8.773598e+04
## min      7.000000e+00
## 25%      5.900000e+02
## 50%      3.518000e+03
## 75%      2.232700e+04
## max      2.473538e+06
## Name: length, dtype: float64
gdf.head()
##    chrid  start    end            gene_id                           gene_type  \
## 0   chr1  11869  14409  ENSG00000290825.1                              lncRNA   
## 5   chr1  12010  13670  ENSG00000223972.6  transcribed_unprocessed_pseudogene   
## 13  chr1  14696  24886  ENSG00000227232.6              unprocessed_pseudogene   
## 25  chr1  17369  17436  ENSG00000278267.1                               miRNA   
## 28  chr1  29554  31109  ENSG00000243485.5                              lncRNA   
## 
##       gene_name  length  
## 0       DDX11L2    2540  
## 5       DDX11L1    1660  
## 13       WASH7P   10190  
## 25    MIR6859-1      67  
## 28  MIR1302-2HG    1555

1.3 将gdf以Tab分隔的形式存储到文件gene.txt中(文件中不保留行标签)。

# 保存文件
gdf.to_csv('gdf.txt', index=False, encoding='utf-8')

实验作业3-2

2.1 对不同gene_type，统计各类型length的分布，绘制箱型图，以300pi保存为pdf文件。

import matplotlib.pyplot as plt

# type过多选前十种
top_type = gdf['gene_type'].value_counts()[:10].index.tolist()
box_data = gdf[gdf['gene_type'].isin(top_type)][['gene_type', 'length']]

# 绘制箱线图
box_data.boxplot(column='length', by='gene_type', showfliers=False, widths=0.5)
# 将y轴的比例设置为对数尺度，以更好地表示数据的分布
plt.yscale('log')
# 设置x轴标签
plt.xlabel('gene_type', fontsize=12)
# 设置y轴标签
plt.ylabel('length', fontsize=12)
# 设置x轴刻度标签的旋转角度为15度，并且设置字体和字号
plt.xticks(rotation=15, fontproperties='Times New Roman', size=6)
## (array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), [Text(1, 0, 'TEC'), Text(2, 0, 'lncRNA'), Text(3, 0, 'miRNA'), Text(4, 0, 'misc_RNA'), Text(5, 0, 'processed_pseudogene'), Text(6, 0, 'protein_coding'), Text(7, 0, 'snRNA'), Text(8, 0, 'snoRNA'), Text(9, 0, 'transcribed_unprocessed_pseudogene'), Text(10, 0, 'unprocessed_pseudogene')])
# 根据图形元素自动调整子图参数，以防止x轴标签或标题遮挡
plt.tight_layout()
# 删除pandas自动生成的标题
plt.suptitle('')
# 设置标题
plt.title('Box Plot of Gene Lengths by Gene Type', fontsize=14)
# 以300pi保存为pdf文件
plt.savefig('Box Plot of Gene Lengths by Gene Type.pdf', dpi=300)
# 箱线图展示
plt.show()

箱线图

2.2 对不同gene_type，统计数目最多的5种，绘制饼图，以300pi保存为pdf文件。

import matplotlib.pyplot as plt

# 统计数目最多的5种gene_type及其数目
pie_data = pd.DataFrame({
    'gene_type':gdf['gene_type'].value_counts()[:5].index.tolist(),
    'counts':gdf['gene_type'].value_counts()[:5].tolist()
})

# 提取 'gene_type' 和 'counts' 作为饼图的标签和数据
labels = pie_data['gene_type']
sizes = pie_data['counts']

# 绘制饼图
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
## ([<matplotlib.patches.Wedge object at 0x0000017152A96B90>, <matplotlib.patches.Wedge object at 0x000001714E282710>, <matplotlib.patches.Wedge object at 0x000001714E29ED10>, <matplotlib.patches.Wedge object at 0x000001714E283ED0>, <matplotlib.patches.Wedge object at 0x0000017152A1C550>], [Text(0.39635091237930165, 1.026112057358306, 'protein_coding'), Text(-1.0389360288264298, -0.36140272274343493, 'lncRNA'), Text(0.4278052340150467, -1.0134015402343393, 'processed_pseudogene'), Text(1.0002724817856155, -0.45766249811672977, 'unprocessed_pseudogene'), Text(1.091392537231306, -0.13734019686825963, 'misc_RNA')], [Text(0.21619140675234635, 0.5596974858318032, '38.3%'), Text(-0.5666923793598707, -0.1971287578600554, '34.1%'), Text(0.2333483094627527, -0.5527644764914578, '17.9%'), Text(0.545603171883063, -0.24963408988185257, '5.7%'), Text(0.595305020307985, -0.07491283465541433, '4.0%')])
# 保证饼图为圆形
plt.axis('equal')
## (-1.0999974729173647, 1.0999998796627317, -1.0999974290383818, 1.099994675423214)
# 设置标题
plt.title('Pie Plot of Percentage by Top 5 Gene Types')  
# 以300pi保存为pdf文件
plt.savefig('Pie Plot of Percentage by Top 5 Gene Types.pdf', dpi=300)
# 饼图展示
plt.show()