1.1 从DrugBank的full database.xml文件中提取approved药物的id、name、type,以及experimental-properties/calculated-properties中的Molecular Weight;将targets中人类靶点的gene name整理为一个以逗号分隔的字符串。将每种药物的以上信息存入一个字典中。
import os
import xml.etree.ElementTree as ET
# Clark-notation prefix of the DrugBank XML namespace; every tag lookup
# below has to be qualified with it.
_NS = '{http://www.drugbank.ca}'


def _is_approved(drug):
    """Return True when one of the drug's <group> entries is 'approved'."""
    groups = drug.find(_NS + 'groups')
    if groups is None:
        return False
    return any(group.text == 'approved'
               for group in groups.findall(_NS + 'group'))


def _molecular_weight(drug):
    """Return the drug's 'Molecular Weight' property as a float.

    Both <experimental-properties> and <calculated-properties> are scanned,
    in that order, and the last usable value wins, so a calculated weight
    overrides an experimental one. Returns 0 when no usable value exists.
    Values that are missing or not numeric are skipped instead of raising.
    """
    weight = 0
    for section in ('experimental-properties', 'calculated-properties'):
        properties = drug.find(_NS + section)
        if properties is None:
            continue
        for prop in properties.findall(_NS + 'property'):
            kinds = [kind.text for kind in prop.findall(_NS + 'kind')]
            if 'Molecular Weight' not in kinds:
                continue
            try:
                weight = float(prop.findtext(_NS + 'value'))
            except (TypeError, ValueError):
                # <value> is absent (None) or not a plain number.
                continue
    return weight


def _is_human(element):
    """Return True when the element's <organism> child denotes humans.

    NOTE(review): relies on DrugBank labelling human entries with the text
    'Humans' and/or ncbi-taxonomy-id="9606" -- confirm against the schema
    version of the export in use.
    """
    if element is None:
        return False
    organism = element.find(_NS + 'organism')
    if organism is None:
        return False
    if organism.get('ncbi-taxonomy-id') == '9606':
        return True
    return (organism.text or '').strip() == 'Humans'


def _human_target_genes(drug):
    """Return the gene names of the drug's human targets, in document order.

    Only the first <polypeptide> of each <target> is inspected. Targets
    without a polypeptide, without a gene name, or belonging to another
    organism are skipped.
    """
    genes = []
    targets = drug.find(_NS + 'targets')
    if targets is None:
        return genes
    for target in targets.findall(_NS + 'target'):
        polypeptide = target.find(_NS + 'polypeptide')
        if polypeptide is None:
            continue
        if not (_is_human(polypeptide) or _is_human(target)):
            continue
        gene_name = polypeptide.findtext(_NS + 'gene-name')
        if gene_name:
            genes.append(gene_name)
    return genes


def _collect_approved_drugs(root):
    """Build one summary dictionary per approved drug under *root*.

    Each dictionary has the keys 'id' (primary DrugBank id), 'name',
    'type' (the <drug> element's type attribute), 'Weight' and 'target'
    (comma-separated human target gene names, or '无' when there are none).
    """
    drugs = []
    for drug in root.findall(_NS + 'drug'):
        if not _is_approved(drug):
            continue
        genes = _human_target_genes(drug)
        drugs.append({
            'id': drug.findtext(_NS + "drugbank-id[@primary='true']"),
            'name': drug.findtext(_NS + 'name'),
            'type': drug.get('type'),
            'Weight': _molecular_weight(drug),
            'target': ','.join(genes) if genes else '无',
        })
    return drugs


# Switch to the directory that holds the DrugBank export so that the
# relative file name below resolves.
os.chdir('D:\\桌面\\实验\\python实验\\python-class')
# Parse the XML file.
tree = ET.parse('full database.xml')
root = tree.getroot()
# One dictionary per approved drug, in document order.
all_drugs = _collect_approved_drugs(root)
1.2 构建所有药物的列表,按药物的Molecular Weight降序排列后输出。
# Rank the drugs from heaviest to lightest on a copy of the list, leaving
# all_drugs itself in its original order. The sort is stable, so records
# with equal weights keep their relative order.
sorted_drugs = list(all_drugs)
sorted_drugs.sort(key=lambda record: record['Weight'], reverse=True)
# Preview only the ten heaviest entries rather than the whole list.
print(sorted_drugs[:10])
## [{'id': 'DB12872', 'name': 'Vonicog alfa', 'type': 'biotech', 'Weight': 20000000.0, 'target': 'F8,COL1A1'}, {'id': 'DB13932', 'name': 'Voretigene neparvovec', 'type': 'biotech', 'Weight': 1330000.0, 'target': 'RPE65'}, {'id': 'DB12839', 'name': 'Pegvaliase', 'type': 'biotech', 'Weight': 1000000.0, 'target': '无'}, {'id': 'DB00083', 'name': 'Botulinum toxin type A', 'type': 'biotech', 'Weight': 900000.0, 'target': 'SNAP25,RHOB'}, {'id': 'DB13192', 'name': 'Antihemophilic factor human', 'type': 'biotech', 'Weight': 480000.0, 'target': 'F9,F10'}, {'id': 'DB16662', 'name': 'Efanesoctocog alfa', 'type': 'biotech', 'Weight': 312000.0, 'target': 'F9,F10,LRP2,HSPG2'}, {'id': 'DB16007', 'name': 'Rurioctocog alfa pegol', 'type': 'biotech', 'Weight': 269812.0, 'target': 'VWF,F8'}, {'id': 'DB00025', 'name': 'Antihemophilic factor, human recombinant', 'type': 'biotech', 'Weight': 264725.5, 'target': 'F10,F9,VWF,PHYH,ASGR2,HSPA5,CALR,CANX,LMAN1,LRP1,MCFD2'}, {'id': 'DB06372', 'name': 'Rilonacept', 'type': 'biotech', 'Weight': 251000.0, 'target': 'IL1B,IL1A,IL1RN'}, {'id': 'DB11607', 'name': 'Efmoroctocog alfa', 'type': 'biotech', 'Weight': 220000.0, 'target': 'VWF'}]
改用DataFrame格式查看,并将结果写入CSV文件。
import pandas as pd
# Tabulate the sorted records: one row per drug, one column per dictionary
# key, then show pandas' (possibly truncated) rendering of the table.
drugs_data = pd.DataFrame(data=sorted_drugs)
print(drugs_data)
## id ... target
## 0 DB12872 ... F8,COL1A1
## 1 DB13932 ... RPE65
## 2 DB12839 ... 无
## 3 DB00083 ... SNAP25,RHOB
## 4 DB13192 ... F9,F10
## ... ... ... ...
## 4384 DB18705 ... 无
## 4385 DB18712 ... 无
## 4386 DB18713 ... 无
## 4387 DB18714 ... 无
## 4388 DB18717 ... 无
##
## [4389 rows x 5 columns]
# Export the table to CSV without the row index. 'utf-8-sig' prepends a
# byte-order mark -- presumably so that spreadsheet tools detect the
# encoding of the non-ASCII placeholder text.
drugs_data.to_csv(
    'drugs_data.csv',
    encoding='utf-8-sig',
    index=False,
)