from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import time
import matplotlib.pyplot as plt
import re
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from Bio import Entrez
import os
import requests
import xml.etree.ElementTree as ET
def get_clinvar_by_accession(accession, pos, ref, alt, email, max_retry=3):
"""
通过ClinVar查询变异信息
参数:
accession: 参考序列的Accession (如"NC_000013.11")
pos: 在参考序列上的位置 (1-based)
ref: 参考序列的碱基
alt: 突变碱基
email: 邮箱
max_retry: 最大重试次数
返回:
包含ClinVar变异信息的字典
"""
Entrez.email = email
# 构建ClinVar查询: 使用accession:g.REF>ALT格式[^1]
# query = f"{accession}:g.{pos}{ref}>{alt}"
query = f"{accession}:g.{pos}"
print(query)
clinvar_info = {
'rsid': None,
'variation_name': None,
'clinical_significance': None,
'rcv_accession': None
}
for attempt in range(max_retry):
try:
# 搜索ClinVar数据库
handle = Entrez.esearch(db="clinvar", term=query, retmax=1)
record = Entrez.read(handle)
handle.close()
id_list = record["IdList"]
if not id_list:
return clinvar_info
# 获取ClinVar记录的详细信息
handle = Entrez.esummary(db="clinvar", id=id_list[0], report="full")
# 使用parse代替read,并关闭验证
parser = Entrez.parse(handle, validate=False)
clinvar_records = list(parser)
handle.close()
if not clinvar_records:
return clinvar_info
summary = clinvar_records[0]
# 解析ClinVar数据
clinvar_info['rcv_accession'] = summary.get('accession', 'N/A')
clinvar_info['variation_name'] = summary.get('title', 'N/A')
# 注意:clinical_significance字段可能是一个字典,其中包含'description'
if 'clinical_significance' in summary:
# 检查summary['clinical_significance']的类型,如果是字典则获取description
if isinstance(summary['clinical_significance'], dict):
clinvar_info['clinical_significance'] = summary['clinical_significance'].get('description', 'N/A')
else:
clinvar_info['clinical_significance'] = summary['clinical_significance']
else:
clinvar_info['clinical_significance'] = 'N/A'
# 提取rs编号
if 'xrefs' in summary:
for xref in summary['xrefs']:
db = xref.get('db')
if db == 'dbSNP':
clinvar_info['rsid'] = xref.get('id', 'N/A')
break
return clinvar_info
except Exception as e:
print(f"尝试 {attempt+1} 失败: {str(e)}")
time.sleep(2)
return clinvar_info
def sanger_blast_analysis(abi_file, threshold=20, blast_db="nt", wait_time=600):
"""
Sanger测序质量评估 + BLAST比对分析(限制为智人)
参数:
abi_file: .ab1文件路径
threshold: 质量阈值 (默认Q<20为低质量)
blast_db: BLAST数据库 (默认nt)
wait_time: 等待BLAST结果时间(秒)
返回:
包含质量评估和BLAST结果的字典
"""
# ===== 1. 质量评估 =====
record = SeqIO.read(abi_file, "abi")
seq = str(record.seq)
qualities = record.letter_annotations["phred_quality"]
# 标记低质量区域
marked_seq = list(seq)
for i, q in enumerate(qualities):
if q < threshold:
marked_seq[i] = 'N'
# 修剪低质量区域后的序列
trimmed_seq = ''.join(marked_seq).replace('N', '')
# ===== 2. BLAST比对(限制为智人)=====
print("提交BLAST比对(限制为智人)...")
blast_handle = NCBIWWW.qblast(
program="blastn",
database=blast_db,
sequence=trimmed_seq,
hitlist_size=5,
format_type="XML",
entrez_query="txid9606[Organism]"
)
print(f"等待BLAST结果 ({wait_time}秒)...")
time.sleep(wait_time)
# ===== 3. 解析BLAST结果 =====
blast_records = NCBIXML.parse(blast_handle)
blast_results = []
for record in blast_records:
for alignment in record.alignments:
if "Homo sapiens" not in alignment.title and "human" not in alignment.title.lower():
continue
for hsp in alignment.hsps:
gene_name = "Unknown"
if 'gene' in alignment.title.lower():
gene_match = re.search(r'gene=([^\]]+)', alignment.title)
if gene_match:
gene_name = gene_match.group(1)
nc_id = "Unknown"
id_match = re.search(r'\|([A-Z]{2}_?\d+\.\d)\|', alignment.hit_id)
if id_match:
nc_id = id_match.group(1)
mutations = []
for pos, (q, s) in enumerate(zip(hsp.query, hsp.sbjct)):
if q != s and q != '-' and s != '-':
mutations.append({
"position": hsp.sbjct_start + pos,
"ref_base": s,
"query_base": q,
"clinvar_info": None # 初始化为None
})
blast_results.append({
"accession": alignment.accession,
"nc_id": nc_id,
"gene": gene_name,
"title": alignment.title,
"e_value": hsp.expect,
"identity": hsp.identities / hsp.align_length * 100,
"mutations": mutations,
"alignment": hsp
})
# ===== 4. 查询ClinVar变异信息 =====
print("查询ClinVar变异信息...")
for result in blast_results:
for mutation in result['mutations']:
# 使用优化方法查询ClinVar信息
clinvar_info = get_clinvar_by_accession(
accession=result['nc_id'],
pos=mutation['position'],
ref=mutation['ref_base'],
alt=mutation['query_base'],
email=Entrez.email
)
mutation['clinvar_info'] = clinvar_info
time.sleep(1) # 避免请求过于频繁
# ===== 5. 可视化 =====
plt.figure(figsize=(12, 6))
plt.subplot(211)
plt.plot(range(len(qualities)), qualities, 'b-')
plt.axhline(y=threshold, color='r', linestyle='--', label=f'阈值 (Q={threshold})')
plt.title("Sanger测序质量分布")
plt.xlabel("碱基位置")
plt.ylabel("Phred质量分数")
plt.legend()
quality_plot_path = os.path.splitext(abi_file)[0] + "_quality.png"
plt.savefig(quality_plot_path)
plt.close()
return {
"original_sequence": seq,
"trimmed_sequence": trimmed_seq,
"quality_scores": qualities,
"average_quality": sum(qualities)/len(qualities),
"quality_plot": quality_plot_path,
"blast_results": blast_results
}
def save_blast_to_excel(analysis_result, output_path="blast_results.xlsx"):
"""
将BLAST结果保存到Excel,每个匹配项一个工作表
"""
wb = Workbook()
wb.remove(wb.active)
summary_ws = wb.create_sheet(title="摘要")
summary_ws.append(["Sanger测序分析报告"])
summary_ws.append([])
summary_ws.append(["原始序列长度", len(analysis_result["original_sequence"])])
summary_ws.append(["修剪后序列长度", len(analysis_result["trimmed_sequence"])])
summary_ws.append(["平均质量分数", f"{analysis_result['average_quality']:.2f}"])
summary_ws.append(["质量分布图路径", analysis_result["quality_plot"]])
summary_ws.append([])
summary_ws.append(["BLAST匹配数", len(analysis_result["blast_results"])])
summary_ws.append([])
summary_ws.append(["Accession", "基因", "NC编号", "E值", "相似度(%)", "突变数"])
for i, result in enumerate(analysis_result["blast_results"]):
sheet_name = f"匹配{i+1}_{result['gene'][:20]}"[:31]
sheet_name = re.sub(r'[\\/*?[\]:]', '', sheet_name)
ws = wb.create_sheet(title=sheet_name)
ws.append(["Accession", result["accession"]])
ws.append(["基因", result["gene"]])
ws.append(["NC编号", result["nc_id"]])
ws.append(["标题", result["title"]])
ws.append(["E值", result["e_value"]])
ws.append(["相似度(%)", f"{result['identity']:.2f}"])
ws.append([])
ws.append(["比对详情"])
ws.append(["查询序列", result["alignment"].query])
ws.append(["匹配序列", result["alignment"].match])
ws.append(["目标序列", result["alignment"].sbjct])
ws.append([])
if result["mutations"]:
ws.append(["突变位点"])
ws.append(["位置", "参考碱基", "查询碱基", "rs编号", "临床意义", "变异名称", "RCV编号"])
for mutation in result["mutations"]:
clinvar_info = mutation.get("clinvar_info", {})
ws.append([
mutation["position"],
mutation["ref_base"],
mutation["query_base"],
clinvar_info.get("rsid", "N/A"),
clinvar_info.get("clinical_significance", "N/A"),
clinvar_info.get("variation_name", "N/A"),
clinvar_info.get("rcv_accession", "N/A")
])
else:
ws.append(["未检测到突变"])
summary_ws.append([
result["accession"],
result["gene"],
result["nc_id"],
result["e_value"],
f"{result['identity']:.2f}",
len(result["mutations"])
])
wb.save(output_path)
print(f"已保存结果至: {output_path}")
return output_path
# 示例使用
if __name__ == "__main__":
Entrez.email = "976960558@qq.com" # 替换为您的邮箱
abi_file = "122-G2.ab1" # 替换为您的AB1文件路径
result = sanger_blast_analysis(abi_file)
excel_path = save_blast_to_excel(result, "sanger_blast_results.xlsx")
print("\n分析摘要:")
print(f"原始序列长度: {len(result['original_sequence'])}")
print(f"修剪后序列长度: {len(result['trimmed_sequence'])}")
print(f"平均质量分数: {result['average_quality']:.2f}")
print(f"质量分布图保存至: {result['quality_plot']}")
print(f"发现 {len(result['blast_results'])} 个智人匹配项")
print("\nBLAST比对结果:")
for i, hit in enumerate(result["blast_results"]):
print(f"\n匹配 #{i+1}:")
print(f" 基因: {hit['gene']}")
print(f" NC编号: {hit['nc_id']}")
print(f" 相似度: {hit['identity']:.2f}%")
print(f" E值: {hit['e_value']}")
if hit['mutations']:
print(f" 发现 {len(hit['mutations'])} 个突变位点:")
for mut in hit['mutations']:
clinvar = mut['clinvar_info']
print(f" 位置 {mut['position']}: {mut['ref_base']}→{mut['query_base']}")
print(f" rs编号: {clinvar.get('rsid', 'N/A')}")
print(f" 变异名称: {clinvar.get('variation_name', 'N/A')}")
print(f" 临床意义: {clinvar.get('clinical_significance', 'N/A')}")
print(f" RCV编号: {clinvar.get('rcv_accession', 'N/A')}")
print(f"\n完整结果已保存至Excel文件: {excel_path}")
使用方法1:使用Entrez.parse代替Entrez.read 请把优化后的代码发给我
最新发布