利用biomart实现同源基因自动批量查找

世小代

已于 2024-07-28 00:32:21 修改

阅读量875

点赞数 10

文章标签： python

于 2024-07-27 16:45:22 首次发布

本文链接：https://blog.csdn.net/Yoshikirinda/article/details/140737595

版权

https://github.com/yoshirinda/find-homologs

已经封装为了可执行文件，欢迎尝试

Ensemble plants网站是少数提供了biomart的外部可访问api链接的数据库。

网站事先通过序列比对和进化树构建找到已收录的各物种的同源基因。

利用biomart python包可以直接访问http://plants.ensembl.org/biomart'

每个基因的注释中有对应的{species}_eg_homolog_ensembl_gene的属性，该属性对应了目的基因在各物种中的同源基因。其中物种名采取缩写形式，如玉米 zea mays 为zmays。

利用本机python运行只需两分钟就能完成拟南芥中16个基因在四个物种中的同源基因批量查询。

注意，要求数据库必须被ensemble网站收录。

代码如下，空闲的时候可以考虑将其封装为简单操作的软件。

import requests
from biomart import BiomartServer
import os

# 连接到Ensembl的Biomart服务器
server = BiomartServer('http://plants.ensembl.org/biomart')

# 选择拟南芥的数据集
mart = server.datasets['athaliana_eg_gene']

# 定义拟南芥基因列表
athaliana_genes = [
    'At2g47430', 'At1g09570', 'At2g18790', 'At1g66340', 'At1g04310', 
    'At3g23150', 'At2g40940', 'At5g10720', 'At2g01830', 'At1g27320', 
    'At5g35750', 'At2g17820', 'At3g04580', 'At4g16250', 'At5g35840',
    'At4g18130'  # 添加新的拟南芥基因
]

# 定义物种及其数据集名称
species_datasets = {
    'Zea Mays': 'zmays_eg_gene',
    'Galdieria sulphuraria': 'gsulphuraria_eg_gene',
    'Cyanidioschyzon merolae': 'cmerolae_eg_gene',
    'Chondrus crispus': 'ccrispus_eg_gene',
    'Ostreococcus lucimarinus': 'olucimarinus_eg_gene'
}

# 创建输出目录
output_dir = 'output_files'
os.makedirs(output_dir, exist_ok=True)

# 定义一个函数来查询蛋白质序列
def fetch_protein_sequences(dataset, gene_ids):
    response = dataset.search({
        'filters': {'ensembl_gene_id': gene_ids},
        'attributes': ['ensembl_gene_id', 'peptide']
    })
    sequences = []
    for row in response.iter_lines():
        columns = row.decode('utf-8').split('\t')
        if len(columns) == 2:
            sequences.append((columns[0], columns[1]))  # 确保基因ID和蛋白质序列正确对应
    return sequences

# 处理每个拟南芥基因
processed_sequences = {}
for athaliana_gene in athaliana_genes:
    print(f"正在处理基因: {athaliana_gene}")
    
    for species, dataset_name in species_datasets.items():
        homolog_attr = f"{dataset_name.split('_')[0]}_eg_homolog_ensembl_gene"
        
        # 构建查询，查询拟南芥基因的同源基因
        try:
            print(f"查询基因 {athaliana_gene} 在物种 {species} 中的同源基因...")
            response = mart.search({
                'filters': {'ensembl_gene_id': athaliana_gene},
                'attributes': [homolog_attr]
            })
            
            homolog_gene_ids = set()
            for row in response.iter_lines():
                columns = row.decode('utf-8').split('\t')
                if len(columns) == 1 and columns[0]:
                    homolog_gene_ids.add(columns[0])
            
            if homolog_gene_ids:
                # 保存同源基因ID到文件
                with open(os.path.join(output_dir, f"{species}_homolog_gene_ids.txt"), 'a') as f:
                    f.write(f"{athaliana_gene}\t{species}\t{','.join(homolog_gene_ids)}\n")
                
                print(f"基因 {athaliana_gene} 在物种 {species} 中的同源基因: {homolog_gene_ids}")
                
                # 查询蛋白质序列
                sequences = fetch_protein_sequences(server.datasets[dataset_name], list(homolog_gene_ids))
                
                # 保存蛋白质序列到字典，避免重复输出
                for gene_id, seq in sequences:
                    if gene_id not in processed_sequences:
                        processed_sequences[gene_id] = seq
                    else:
                        # 如果已经存在相同名称的序列，保存较长的那条
                        if len(seq) > len(processed_sequences[gene_id]):
                            processed_sequences[gene_id] = seq
                
                print(f"基因 {athaliana_gene} 的蛋白质序列已保存到字典")
            else:
                print(f"基因 {athaliana_gene} 在物种 {species} 中没有找到同源基因")
        
        except Exception as e:
            print(f"查询基因 {athaliana_gene} 在物种 {species} 中的同源基因时出错: {e}")

# 最终输出到FASTA文件
with open(os.path.join(output_dir, f"all_species_protein_sequences.fasta"), 'w') as f:
    for gene_id, seq in processed_sequences.items():
        f.write(f">{gene_id}\n{seq}\n")

print(f"所有基因的同源基因ID和蛋白质序列已保存到文件")