python实验2：xml文件的分析与整合

最新推荐文章于 2024-10-08 12:37:10 发布

小何同学#

最新推荐文章于 2024-10-08 12:37:10 发布

阅读量210

点赞数 3

文章标签： python 笔记经验分享

本文链接：https://blog.csdn.net/m0_69913686/article/details/139375934

版权

1.1 从Drugbank的full database.xml文件中提取approved药物的id、name、type，以及`experimental-properties`/`calculated-properties`中的Molecular Weight，并将targets中人类靶点的gene name整理为一个以逗号（,）分隔的字符串。将每种药物的以上信息存入一个字典中。

import os
import xml.etree.ElementTree as ET

# Every DrugBank element lives in this XML namespace; ElementTree expects it
# in Clark notation ("{uri}tag"), so it is defined once and prepended to tags.
DRUGBANK_NS = '{http://www.drugbank.ca}'


def _is_approved(drug):
    """Return True when the drug element lists "approved" among its groups."""
    groups = drug.find(DRUGBANK_NS + 'groups')
    if groups is None:
        # A drug without a <groups> element cannot be approved.
        return False
    return any(group.text == 'approved'
               for group in groups.findall(DRUGBANK_NS + 'group'))


def _molecular_weight(drug):
    """Return the drug's "Molecular Weight" property as a float.

    Both the experimental and the calculated property lists are scanned, in
    that order, and the last parsable match wins (so a calculated value
    overrides an experimental one when both exist). Returns 0 when no usable
    value is found, which keeps the records sortable by weight.
    """
    weight = 0
    for section in ('experimental-properties', 'calculated-properties'):
        properties = drug.find(DRUGBANK_NS + section)
        if properties is None:
            continue
        for prop in properties.findall(DRUGBANK_NS + 'property'):
            kinds = prop.findall(DRUGBANK_NS + 'kind')
            if not any(kind.text == 'Molecular Weight' for kind in kinds):
                continue
            try:
                weight = float(prop.findtext(DRUGBANK_NS + 'value'))
            except (TypeError, ValueError):
                # Missing, empty or non-numeric <value>: keep the previous weight.
                continue
    return weight


def _human_target_genes(drug):
    """Return the gene names of the drug's human targets, in document order."""
    targets = drug.find(DRUGBANK_NS + 'targets')
    if targets is None:
        return []
    genes = []
    for target in targets:
        polypeptide = target.find(DRUGBANK_NS + 'polypeptide')
        if polypeptide is None:
            continue
        # NOTE(review): assumes DrugBank marks human entries with the organism
        # text "Humans" (on the target, or else on its polypeptide) -- confirm
        # against the schema of the release being parsed.
        organism = (target.findtext(DRUGBANK_NS + 'organism')
                    or polypeptide.findtext(DRUGBANK_NS + 'organism'))
        if organism != 'Humans':
            continue
        gene_name = polypeptide.findtext(DRUGBANK_NS + 'gene-name')
        if gene_name:
            genes.append(gene_name)
    return genes


# Switch to the directory that holds the DrugBank export.
os.chdir('D:\\桌面\\实验\\python实验\\python-class')

# Parse the whole XML document into memory.
tree = ET.parse('full database.xml')
root = tree.getroot()

# One dictionary per approved drug.
all_drugs = []

for drug in root.findall(DRUGBANK_NS + 'drug'):
    if not _is_approved(drug):
        continue
    target_genes = _human_target_genes(drug)
    all_drugs.append({
        'id': drug.findtext(DRUGBANK_NS + "drugbank-id[@primary='true']"),
        'name': drug.findtext(DRUGBANK_NS + 'name'),
        'type': drug.get('type'),
        'Weight': _molecular_weight(drug),
        # '无' is the placeholder for drugs with no usable target gene.
        'target': ','.join(target_genes) if target_genes else '无',
    })

1.2 构建所有药物的列表，按药物的Molecular Weight降序排列后输出。

# Order the drug records from the heaviest to the lightest molecular weight,
# leaving the original all_drugs list untouched.
sorted_drugs = list(all_drugs)
sorted_drugs.sort(key=lambda record: record['Weight'], reverse=True)

# Preview only the ten heaviest entries rather than the whole list.
top_ten = sorted_drugs[:10]
print(top_ten)
## [{'id': 'DB12872', 'name': 'Vonicog alfa', 'type': 'biotech', 'Weight': 20000000.0, 'target': 'F8,COL1A1'}, {'id': 'DB13932', 'name': 'Voretigene neparvovec', 'type': 'biotech', 'Weight': 1330000.0, 'target': 'RPE65'}, {'id': 'DB12839', 'name': 'Pegvaliase', 'type': 'biotech', 'Weight': 1000000.0, 'target': '无'}, {'id': 'DB00083', 'name': 'Botulinum toxin type A', 'type': 'biotech', 'Weight': 900000.0, 'target': 'SNAP25,RHOB'}, {'id': 'DB13192', 'name': 'Antihemophilic factor human', 'type': 'biotech', 'Weight': 480000.0, 'target': 'F9,F10'}, {'id': 'DB16662', 'name': 'Efanesoctocog alfa', 'type': 'biotech', 'Weight': 312000.0, 'target': 'F9,F10,LRP2,HSPG2'}, {'id': 'DB16007', 'name': 'Rurioctocog alfa pegol', 'type': 'biotech', 'Weight': 269812.0, 'target': 'VWF,F8'}, {'id': 'DB00025', 'name': 'Antihemophilic factor, human recombinant', 'type': 'biotech', 'Weight': 264725.5, 'target': 'F10,F9,VWF,PHYH,ASGR2,HSPA5,CALR,CANX,LMAN1,LRP1,MCFD2'}, {'id': 'DB06372', 'name': 'Rilonacept', 'type': 'biotech', 'Weight': 251000.0, 'target': 'IL1B,IL1A,IL1RN'}, {'id': 'DB11607', 'name': 'Efmoroctocog alfa', 'type': 'biotech', 'Weight': 220000.0, 'target': 'VWF'}]

改用dataframe格式查看并写入

import pandas as pd

# Tabulate the sorted records (one row per drug, one column per dictionary
# key) and display the resulting frame.
drugs_data = pd.DataFrame(data=sorted_drugs)
print(drugs_data)
##            id  ...       target
## 0     DB12872  ...    F8,COL1A1
## 1     DB13932  ...        RPE65
## 2     DB12839  ...            无
## 3     DB00083  ...  SNAP25,RHOB
## 4     DB13192  ...       F9,F10
## ...       ...  ...          ...
## 4384  DB18705  ...            无
## 4385  DB18712  ...            无
## 4386  DB18713  ...            无
## 4387  DB18714  ...            无
## 4388  DB18717  ...            无
## 
## [4389 rows x 5 columns]

# Save the table as CSV without the index column. The 'utf-8-sig' codec
# prepends a byte-order mark -- presumably so spreadsheet tools detect UTF-8
# and render the non-ASCII '无' placeholder correctly.
drugs_data.to_csv(path_or_buf='drugs_data.csv', index=False, encoding='utf-8-sig')