记载一下如何汇总4大数据库的数据
上代码:
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 26 19:41:40 2021
@author: recherHE
@Email:recherHE.@163.COM
"""
import pandas as pd
import os,sys
def read_big_csv(input_path):
    """
    Load a CSV file piece by piece and return it as a single table.

    Parameters
    ----------
    input_path : str
        csv file path

    Returns
    -------
    res_df : pandas.DataFrame
        every row of the file, parsed 10000 rows at a time and concatenated
    """
    # The reader hands back one DataFrame per 10000-row slice of the file.
    reader = pd.read_csv(input_path, chunksize=10000)
    pieces = list(reader)
    return pd.concat(pieces)
def pubchem(pubchem_folder):
    """
    Summarize the pubchem assay information of a folder.

    Every entry of ``pubchem_folder`` is read as a CSV file. Rows without a
    SMILES value are dropped, and files that provide a "Standard Value"
    column are reduced to six columns and collected. The combined table is
    renamed to ID / DB / SMILES / ACTIVITY / UNITS / TYPE, rows with any
    missing value are removed, and the result is written to
    ``PUCHEM_SUMMARY.csv`` inside the same folder.

    Entries that cannot be processed (not a CSV, missing columns, ...) are
    reported on stdout and skipped; they never abort the run.

    Parameters
    ----------
    pubchem_folder : str
        pubchem folder path

    Returns
    -------
    None.
    """
    summary_name = "PUCHEM_SUMMARY.csv"
    source_columns = ["PUBCHEM_CID", "DB", "SMILES", "Standard Value", "Standard Units", "TYPE"]
    summary_columns = ["ID", "DB", "SMILES", "ACTIVITY", "UNITS", "TYPE"]

    frames = []
    # Sorted so that the row order of the summary does not depend on the
    # directory listing order.
    for file in sorted(os.listdir(pubchem_folder)):
        if file == summary_name:
            # Output of a previous run, not an assay file.
            continue
        try:
            df = pd.read_csv(os.path.join(pubchem_folder, file))
            df.dropna(subset=["SMILES"], inplace=True)
            df["DB"] = "PUBCHEM"
            df["TYPE"] = df["Standard Type"]
            if "Standard Value" in df.columns:
                frames.append(df[source_columns])
                print(f"{file} finished")
            else:
                print(f"{file} skipped: no 'Standard Value' column")
        except Exception as exc:
            # Best effort per file: report the reason and carry on.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            print(f"{file} failed: {exc!r}")

    if frames:
        # One concat at the end instead of growing a frame inside the loop.
        df_all = pd.concat(frames, ignore_index=True)
        df_all.columns = summary_columns
    else:
        # Nothing usable was found: still write a header-only summary so
        # later steps can read the file.
        df_all = pd.DataFrame(columns=summary_columns)
    df_all = df_all.dropna()
    df_all.to_csv(os.path.join(pubchem_folder, summary_name), index=False)
def _strip_inequality(value):
    """Remove "<" and ">" from a string value; leave any other value untouched."""
    if isinstance(value, str):
        return value.replace("<", "").replace(">", "")
    return value


def data_summary(Chembl_file,Excape_file,Binding_DB_file,pubchem_folder,outfile):
    """
    Summarize the data of 4 databases (Chembl, Excape, Binding_DB, pubchem).

    Each source is reduced to the columns ID, DB, SMILES, ACTIVITY, UNITS
    and TYPE, the four tables are stacked, rows without SMILES or ACTIVITY
    are dropped, and the result is written to ``outfile``.

    Parameters
    ----------
    Chembl_file : str
        chembl file path
    Excape_file : str
        Excape file path
    Binding_DB_file : str
        Binding_DB file path
    pubchem_folder : str
        pubchem folder path; ``PUCHEM_SUMMARY.csv`` is read from it
    outfile : str
        output file path(csv)

    Returns
    -------
    None.
    """
    # Read everything up front so a missing input fails before any work is done.
    df_excape = read_big_csv(Excape_file)
    df_chembl = read_big_csv(Chembl_file)
    df_binding = read_big_csv(Binding_DB_file)
    df_pubchem = read_big_csv(os.path.join(pubchem_folder, "PUCHEM_SUMMARY.csv"))

    summary_columns = ["ID", "DB", "SMILES", "ACTIVITY", "UNITS", "TYPE"]

    # --- Excape -----------------------------------------------------------
    # .copy() so the column assignments below act on an independent frame
    # (avoids pandas' SettingWithCopy warnings).
    df_excape = df_excape[["Original_Entry_ID", "DB", "SMILES", "pXC50"]].copy()
    # 10**-pXC50 scaled by 1e9 is stored as the nM activity, formatted with
    # one decimal. na_action="ignore" keeps missing values as NaN so the
    # final dropna removes them instead of keeping the literal text "nan".
    df_excape["pXC50"] = df_excape["pXC50"].map(
        lambda x: '%.1f' % (10 ** -x * 10 ** 9), na_action="ignore"
    )
    df_excape["UNITS"] = "nM"
    df_excape["TYPE"] = "IC50"
    df_excape.columns = summary_columns

    # --- BindingDB --------------------------------------------------------
    df_binding = df_binding[['monomerid', 'smile', 'affinity', 'affinity_type']].copy()
    df_binding.columns = ["ID", "SMILES", "ACTIVITY", "TYPE"]
    df_binding["DB"] = "BindingDB"
    df_binding["UNITS"] = "nM"
    # Strip the "<" / ">" qualifiers. Done per value rather than with
    # .str.replace("<|>", ""): that pattern is only a regex when pandas'
    # `regex` default is True, and .str fails on a purely numeric column.
    df_binding["ACTIVITY"] = df_binding["ACTIVITY"].map(_strip_inequality)

    # --- ChEMBL -----------------------------------------------------------
    df_chembl = df_chembl[
        ['molecule_chembl_id', 'canonical_smiles', 'value', 'units', 'type']
    ].copy()
    df_chembl.columns = ["ID", "SMILES", "ACTIVITY", "UNITS", "TYPE"]
    df_chembl["DB"] = "CHEMBL"

    # concat aligns on column names, so the differing column order of the
    # individual frames does not matter; the first frame fixes the order.
    df_all = pd.concat(
        [df_excape, df_binding, df_chembl, df_pubchem], ignore_index=True
    )
    df_all = df_all.dropna(subset=["SMILES", "ACTIVITY"])
    df_all.to_csv(outfile, index=False)
def main(argv=None):
    """
    Command-line entry point: build the pubchem summary, then merge the 4 databases.

    Parameters
    ----------
    argv : list of str, optional
        argument vector laid out like ``sys.argv``: program name followed by
        the pubchem folder, Chembl file, Excape file, Binding_DB file and
        output file paths. Extra trailing items are ignored. ``sys.argv`` is
        used when omitted.

    Returns
    -------
    int or None
        2 when fewer than five paths were supplied, otherwise None.
    """
    # None instead of a mutable list default; fall back to the real command line.
    if argv is None:
        argv = sys.argv
    if len(argv) < 6:
        prog = argv[0] if argv else "script"
        print(
            f"usage: {prog} <pubchem_folder> <Chembl_file> <Excape_file> "
            "<Binding_DB_file> <outfile>",
            file=sys.stderr,
        )
        return 2
    pubchem_folder, Chembl_file, Excape_file, Binding_DB_file, outfile = argv[1:6]
    # The pubchem summary must exist before data_summary reads it.
    pubchem(pubchem_folder)
    data_summary(Chembl_file, Excape_file, Binding_DB_file, pubchem_folder, outfile)
if __name__ == '__main__':
    # Show the expected argument order, then run the pipeline and use
    # main()'s return value as the process exit status.
    usage_banner = """
==========================================================================================================================
Please enter in order {puhcme_folder path} {Chembl file path} {Excape file path} {Binding_DB_file path} {outfile path}
==========================================================================================================================
"""
    print(usage_banner)
    exit_status = main(sys.argv)
    sys.exit(exit_status)
输出结果展示: