处理一个公开数据集时,发现多个 GenBank 登录号对应同一个 gene ID,所以写了个脚本,通过 NCBI 接口批量处理了一下。同时为了避免被封,在每次请求前添加了 sleep,不知道是否真的有用,结果是几万条数据跑下来依然很流畅~~
####### 通过 GenBank 的 BC 登录号获取基因 ID 和序列信息 #######
from Bio import Entrez
from Bio import SeqIO
import time
import random
import csv
from itertools import islice
def get_gene_id(genebank_accession):
    """Fetch one GenBank record from NCBI and extract its gene name and protein.

    The record is requested from the ``nucleotide`` database in GenBank text
    format and its features are scanned in order until the first CDS that
    carries a ``translation`` qualifier.

    Args:
        genebank_accession: Identifier passed as ``id`` to ``Entrez.efetch``.

    Returns:
        A ``(gene_id, protein_sequence)`` tuple.

        ``gene_id`` is the first value of the ``gene`` qualifier of the last
        ``gene`` feature seen before scanning stopped, or ``None`` when no
        such qualifier was found.

        ``protein_sequence`` is the first value of the ``translation``
        qualifier of the first CDS that has one, or the string ``"N/A"``
        when no CDS provides a translation.

    Raises:
        Any exception raised by ``Entrez.efetch`` or ``SeqIO.read`` is
        propagated unchanged to the caller.
    """
    # NOTE(review): the value returned as ``gene_id`` comes from the ``gene``
    # qualifier, which looks like a gene name/symbol rather than a numeric
    # identifier -- confirm this is what downstream consumers expect.
    Entrez.email = "*****@gmail.com"

    # Pause a random whole number of seconds (0-3) before each request; the
    # intent is to avoid being blocked for sending requests too quickly.
    time.sleep(random.randint(0, 3))

    handle = Entrez.efetch(
        db="nucleotide", id=genebank_accession, rettype="gb", retmode="text"
    )
    try:
        record = SeqIO.read(handle, "genbank")
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()

    gene_id = None
    protein_sequence = None
    for feature in record.features:
        if feature.type == "gene":
            # A gene feature without a ``gene`` qualifier must not discard
            # the whole record, so look it up defensively.
            names = feature.qualifiers.get("gene")
            if names:
                gene_id = names[0]
        elif feature.type == "CDS":
            # A CDS without a ``translation`` qualifier is skipped instead of
            # raising KeyError; a later CDS may still carry one.
            translations = feature.qualifiers.get("translation")
            if translations:
                protein_sequence = translations[0]
                break

    if protein_sequence is None:
        protein_sequence = "N/A"
    return gene_id, protein_sequence
# Batch driver: read accessions from the first column of the input CSV, look
# each one up with get_gene_id(), and append one result row per accession to
# the output CSV.
file_path = 'genebank.csv'

# Half-open window of data rows handled by this run (0-based, counted after
# the header row): rows FIRST_ROW .. STOP_ROW - 1.
FIRST_ROW = 1000
STOP_ROW = 4000

# newline='' on both files is what the csv module recommends so that it can
# handle line endings (and newlines embedded in quoted fields) itself.
# The output file is opened in append mode, so rows from this run are added
# after whatever the file already contains.
with open(file_path, 'r', newline='') as infile, \
        open('genseq.csv', 'a', newline='', encoding='gbk') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    # writer.writerow(['GeneBank Accession', 'Gene ID', 'Protein Sequence'])  # CSV header row
    # Skip the input header; the default avoids StopIteration on an empty file.
    next(reader, None)
    for row in islice(reader, FIRST_ROW, STOP_ROW):
        if not row:
            # csv.reader yields [] for a blank line; row[0] would raise an
            # IndexError outside the try block and abort the whole batch.
            continue
        genebank_accession = row[0]  # assumes the accession is the first field
        try:
            gene_id, protein_sequence = get_gene_id(genebank_accession)
        except Exception as e:
            # Best effort: a failed lookup is reported and recorded as N/A so
            # that the remaining accessions are still processed.
            print(f"Failed to retrieve data for GeneBank Accession {genebank_accession}: {e}")
            gene_id = "N/A"
            protein_sequence = "N/A"
        else:
            print("genebank_accession",genebank_accession)
            print("Gene ID:", gene_id)
            print("Protein Sequence:", protein_sequence)
        writer.writerow([genebank_accession, gene_id, protein_sequence])
        # Each row costs a network round trip, so push it to disk right away
        # instead of leaving it in the write buffer.
        outfile.flush()
        print("############")