# Extract the sequence 10 kb up- and downstream of each blast_hit from the results file "abspres1.csv":
from Bio import Entrez, SeqIO
import pandas as pd
import time
# Entrez e-mail and API key (placeholders -- fill in before running).
Entrez.email = 'your_email'
Entrez.api_key = "your_apikey"
# Network robustness: retry failed requests, pause between retries, time out.
Entrez.max_tries = 5
Entrez.sleep_between_tries = 180
Entrez.timeout = 10
# Column names and tunables used by the functions below.
NCBI_ACCESSION_COLUMN_NAME = 'Scaffold'
START_COLUMN_NAME = 'Start'
END_COLUMN_NAME = 'End'
MIN_SEQUENCE_LENGTH = 1
MAX_SEQUENCE_LENGTH = float('inf')
SEQUENCE_NAME_TEMPLATE = '{accession}_{start}_{end}.fasta'
MAX_RETRIES = 3
RETRY_SLEEP_TIME = 5 # seconds
# NOTE(review): MAX_SEQUENCE_LENGTH and SEQUENCE_NAME_TEMPLATE are not
# referenced anywhere in the visible code -- confirm whether they are dead.
def main():
    """Read abspres1.csv and save each hit's region (with 10 kb flanks) as FASTA.

    Each row yields one file named <accession>_<start>_<end>.fasta in the
    current directory. Per-row failures are reported and skipped so one bad
    record does not abort the whole run.
    """
    # Bug fix: the original message said "Reading data.csv" while the code
    # actually reads 'abspres1.csv'.
    print("Reading abspres1.csv")
    df = pd.read_csv('abspres1.csv')
    for index, row in df.iterrows():
        accession = row[NCBI_ACCESSION_COLUMN_NAME]
        start, end = get_sequence_range(row)
        print(f"Downloading sequence for {accession}_{start}_{end}")
        try:
            sequence = download_sequence(accession, start, end)
            if sequence is None:
                print(f"Failed to download {accession}_{start}_{end}.fasta after {MAX_RETRIES} retries")
                continue
            save_sequence(sequence, accession, start, end)
            print(f"Saved {accession}_{start}_{end}.fasta ({len(sequence)} bp)")
        except Exception as e:
            # Best-effort: log and move on to the next row.
            print(f"Error processing {accession}_{start}_{end}.fasta: {e}")
            continue
def get_sequence_range(row):
    """Return (start, end) covering the hit plus 10 kb of flanking sequence.

    Start is clamped to MIN_SEQUENCE_LENGTH (1). End is clamped to the full
    record length when it can be determined; if the length fetch fails, the
    end is left uncapped rather than crashing.
    """
    start = max(MIN_SEQUENCE_LENGTH, row[START_COLUMN_NAME] - 10000)
    end = row[END_COLUMN_NAME] + 10000
    seq_length = get_sequence_length(row)
    # Bug fix: get_sequence_length() returns None on a fetch error, and the
    # original `min(None, end)` would raise TypeError. Only clamp when known.
    if seq_length is not None:
        end = min(seq_length, end)
    return start, end
def get_sequence_length(row):
    """Return the total length (bp) of the record for this row's accession.

    Fetches the full FASTA record from NCBI nuccore. Returns None when the
    fetch or parse fails; the error is printed, not raised.
    """
    accession = row[NCBI_ACCESSION_COLUMN_NAME]
    try:
        handle = Entrez.efetch(db='nuccore', id=accession, rettype='fasta', retmode='text')
        try:
            return len(SeqIO.read(handle, 'fasta'))
        finally:
            handle.close()
    except Exception as e:
        print(f"Error fetching length for {accession}: {e}")
        return None
def download_sequence(accession, start, end):
    """Download the [start, end] region (1-based, inclusive) of an accession.

    Improvement: passes seq_start/seq_stop to efetch so NCBI returns only
    the requested region, instead of downloading the entire record and
    slicing it locally (the original transferred whole scaffolds for a
    ~20 kb window). Retries up to MAX_RETRIES times with RETRY_SLEEP_TIME
    seconds between attempts.

    Returns the region as a Bio.Seq.Seq, or None after all retries fail.
    """
    for attempt in range(MAX_RETRIES):
        try:
            with Entrez.efetch(db='nuccore', id=accession, rettype='fasta',
                               retmode='text', seq_start=start, seq_stop=end) as handle:
                record = SeqIO.read(handle, 'fasta')
            # The server already sliced the region, so the record IS the region.
            return record.seq
        except Exception as e:
            print(f"Error downloading {accession}_{start}_{end}.fasta on attempt {attempt+1}: {e}")
            if attempt == MAX_RETRIES - 1:
                return None
            time.sleep(RETRY_SLEEP_TIME)
def save_sequence(sequence, accession, start, end):
    """Write *sequence* to <accession>_<start>_<end>.fasta in the cwd.

    Bug fix: the original wrote a literal "(unknown)" placeholder as the
    FASTA header, leaving every output record unidentifiable. The header
    now names the extracted region.
    """
    filename = f"{accession}_{start}_{end}.fasta"
    with open(filename, 'w') as f:
        f.write(f">{accession}_{start}_{end}\n{sequence}\n")
# Run the downloader only when executed as a script, not on import.
if __name__ == '__main__':
    main()
# Extract the taxonomic information (species and genus) for each blast_hit from the results file "abspres1.csv":
from Bio import Entrez, SeqIO
import pandas as pd
import time
import requests
from ete3 import NCBITaxa
# Entrez e-mail and API key (placeholders -- fill in before running).
Entrez.email = "your_email"
Entrez.api_key = "your_apikey"
# Network robustness: retry failed requests, pause between retries, time out.
Entrez.max_tries = 5
Entrez.sleep_between_tries = 15
Entrez.timeout = 10
# Load the BLAST-hit table.
# NOTE(review): this reads 'abspres1-re.csv', not the 'abspres1.csv' named
# in the header above -- confirm which file is intended.
df = pd.read_csv('abspres1-re.csv')
# Initialise the ete3 NCBI Taxonomy interface.
ncbi = NCBITaxa()
def get_seq_info(seq_id):
    """Fetch the GenBank record for *seq_id* and return (organism, taxonomy).

    Returns (None, None) on any fetch/parse failure.

    Bug fix: Bio.Entrez performs HTTP via urllib, so network failures raise
    OSError subclasses (HTTPError/URLError) -- the original caught
    requests.exceptions.RequestException, which can never be raised here and
    therefore never matched, and it also leaked the handle on failure. The
    with-statement guarantees the handle is closed either way.
    """
    try:
        with Entrez.efetch(db="nuccore", id=seq_id, rettype="gb", retmode="text") as handle:
            # StopIteration from next() covers an empty/unparseable response.
            record = next(SeqIO.parse(handle, "genbank"))
        return record.annotations.get("organism"), record.annotations.get("taxonomy")
    except (OSError, ValueError, StopIteration) as e:
        print(f"Error fetching sequence {seq_id}: {e}")
        return None, None
# Tallies: phylum name -> count, and sub-phylum rank name -> count.
phylum_dict = {}
genus_dict = {}

# Preference order for the rank reported alongside the phylum; mirrors the
# original elif chain: genus, then family, order, class, clade.
_RANK_PREFERENCE = ('genus', 'family', 'order', 'class', 'clade')


def _first_taxid_of_rank(ranks, wanted):
    """Return the first taxid in *ranks* whose rank equals *wanted*, else None."""
    for tid, rank in ranks.items():
        if rank == wanted:
            return tid
    return None


def classify_lineage(lineage, ranks, names):
    """Resolve (phylum_name, sub_rank_label, sub_rank_name) for one lineage.

    Prefers named ranks; when no 'phylum' rank exists, falls back to fixed
    lineage positions (index 3, else 2) as the original heuristic did, with
    bounds checks so short lineages no longer raise IndexError.
    """
    phylum_tid = _first_taxid_of_rank(ranks, 'phylum')
    if phylum_tid is not None:
        phylum_name = names.get(phylum_tid)
        for wanted in _RANK_PREFERENCE:
            sub_tid = _first_taxid_of_rank(ranks, wanted)
            if sub_tid is not None:
                return phylum_name, wanted.capitalize(), names.get(sub_tid)
        # Phylum known but no finer named rank: use the penultimate node.
        sub_name = names.get(lineage[-2]) if len(lineage) >= 2 else None
        return phylum_name, 'Genus', sub_name
    # No phylum rank at all: positional fallback.
    idx = 3 if len(lineage) >= 4 else 2
    phylum_name = names.get(lineage[idx]) if len(lineage) > idx else None
    sub_name = names.get(lineage[-2]) if len(lineage) >= 2 else None
    return phylum_name, 'Genus', sub_name


# Classify every sequence listed in the table's second column.
for i, seq_id in enumerate(df.iloc[:, 1]):
    species_name, taxonomy = get_seq_info(seq_id)
    if species_name is None:
        print(f"Skipping sequence {seq_id} due to missing species info")
        continue
    # Map the organism name to its NCBI taxid.
    # Bug fix: the original extracted the taxid via
    # str(dict.values()).strip('dict_values([])') -- a string-mangling hack.
    # get_name_translator returns {name: [taxids]}; index it directly.
    name2taxid = ncbi.get_name_translator([species_name])
    taxid_list = name2taxid.get(species_name, [])
    if not taxid_list:
        print(f"Skipping sequence {seq_id} due to missing taxonomic ID")
        time.sleep(1)
        continue
    taxid = taxid_list[0]
    lineage = ncbi.get_lineage(taxid)
    ranks = ncbi.get_rank(lineage)
    names = ncbi.get_taxid_translator(lineage)
    # `names` already translates every lineage taxid, so no extra
    # get_taxid_translator calls are needed (the original re-queried per rank).
    phylum_name, sub_label, genus_name = classify_lineage(lineage, ranks, names)
    print(f"Phylum: {phylum_name}-{sub_label}: {genus_name}")
    if phylum_name:
        phylum_dict[phylum_name] = phylum_dict.get(phylum_name, 0) + 1
    if genus_name:
        genus_dict[genus_name] = genus_dict.get(genus_name, 0) + 1
    print(f"{lineage}\n{ranks}\n{names}")
    print(f"Processing sequence {i + 1}/{len(df)}: {seq_id} - {phylum_name} - {genus_name}\n")
    # Throttle so NCBI does not rate-limit the run.
    time.sleep(1)
# 将分类字典转化为DataFrame表格,并按照数量降序排列
phylum_df = pd.DataFrame(phylum_dict.items(), columns=["Phylum", "Count"]).sort_values(by="Count", ascending=False)
genus_df = pd.DataFrame(genus_dict.items(), columns=["Genus", "Count"]).sort_values(by="Count", ascending=False)
# 输出门和属的分类表格
print("Phylum Classification:\n", phylum_df)
print("\nGenus Classification:\n", genus_df)
# 将分类表格输出为CSV文件
phylum_df.to_csv("phylum_classification-re-5-6.csv", index=False)
genus_df.to_csv("genus_classification-re-5-6.csv", index=False)