kegg 上ko通路数据的获取
最近一直在弄kegg的一些东西,就把心得写下来吧。
接着自己上面的文章:
ko对应K号的表的下载
得到的json文件内容部分如下:
{
"name":"ko00001",
"children":[
{
"name":"09100 Metabolism",
"children":[
{
"name":"09101 Carbohydrate metabolism",
"children":[
{
"name":"00010 Glycolysis \/ Gluconeogenesis [PATH:ko00010]",
"children":[
{
"name":"K00844 HK; hexokinase [EC:2.7.1.1]"
},
{
"name":"K12407 GCK; glucokinase [EC:2.7.1.2]"
},
{
"name":"K00845 glk; glucokinase [EC:2.7.1.2]"
},
。。。。。。。
下面附上处理该 json 文件的 Python 脚本。
import json
import re
def get_K_ko_dict(K_ko_file):
    """Map each pathway entry of a KEGG BRITE JSON file to its K numbers.

    The file must contain a nested ``{"name": ..., "children": [...]}`` tree.
    Nodes on the third level below the root are treated as pathways and
    their direct children as KO entries.

    Args:
        K_ko_file: Path to the JSON file (read as UTF-8).

    Returns:
        A dict mapping a pathway key to a list holding the first
        space-separated token of every child name (the K number).  A node
        whose name contains ``:ko<id>]`` is keyed ``"ko<id>"``; any other
        node is keyed ``"un"`` followed by the first token of its name.
        Nodes without children map to an empty list.  If the same key
        occurs more than once, the last occurrence wins.
    """
    # Compiled once instead of rebuilding the pattern for every node.
    pathway_id_pat = re.compile(r":ko(.*?)]")
    K_ko_dict = {}

    with open(K_ko_file, "r", encoding="utf-8") as f:
        tree = json.load(f)

    # ``or []`` lets nodes without a "children" key be skipped instead of
    # raising TypeError when iterating over None.
    for top_node in tree.get("children") or []:
        for sub_node in top_node.get("children") or []:
            for pathway_node in sub_node.get("children") or []:
                name_info = pathway_node.get("name")
                match = pathway_id_pat.search(name_info)
                if match:
                    key = "ko" + match.group(1)
                else:
                    # Some entries only carry a numeric id and no ko
                    # pathway, so they are stored under an "un" prefix.
                    key = "un" + name_info.split(" ")[0]
                K_ko_dict[key] = [
                    ko_node.get("name").split(" ")[0]
                    for ko_node in pathway_node.get("children") or []
                ]
    return K_ko_dict
if __name__ == '__main__':
    import sys

    # The JSON file path is taken from the first command-line argument,
    # for example: E:\meta\K_ko\ko00001.json
    if len(sys.argv) != 2:
        sys.exit("usage: python {} <ko00001.json>".format(sys.argv[0]))
    # Print the mapping so that running the script shows the result.
    print(get_K_ko_dict(sys.argv[1]))
# 得到的部分结果如下:
{'ko00010': ['K00844', 'K12407'.......'K20118', 'K02753'],
'ko00020': ['K01647', 'K01648', 'K15230', 'K15231', 'K05942', 'K01681', 'K01682', 'K00031', 'K00030'.......
.......}
得到的结果是一个字典:键是 pathway 的 ko 号(没有 ko 号的条目以 "un" 加编号作为键),值是该 pathway 包含的 K 号列表。
可以用于剩下的分析。