TCMSP爬虫批量得到化合物名称或Mol ID
见我的另一篇博客:http://t.csdnimg.cn/lTg6k
通过化合物名称批量获得SMILES序列号/CID序列号
import pubchempy
import pandas as pd
import itertools
import time
def read_file(file_name) -> list:
    """
    Read compound names from a CSV or plain-text file.

    The file is first parsed as a comma-separated table. When the table has
    a ``Molecule Name`` column, the values of that column are returned.
    Otherwise (no such column, an empty file, or a file pandas cannot parse)
    the file is read as plain text with one compound name per line.

    :param file_name: path of the txt or csv file holding compound names
    :return: list of compound names, stripped of surrounding whitespace,
        with blank lines and empty cells removed
    """
    try:
        # dtype=str keeps every cell as text, even names that look numeric.
        df = pd.read_csv(file_name, sep=',', encoding='utf-8-sig', dtype=str)
    except (pd.errors.ParserError, pd.errors.EmptyDataError):
        # Not a well-formed table, or nothing to parse: treat as plain text.
        df = None

    if df is not None and 'Molecule Name' in df.columns:
        # Empty cells are read as NaN; dropna() keeps them out of the result.
        cells = (str(value).strip() for value in df['Molecule Name'].dropna())
        return [name for name in cells if name]

    with open(file_name, 'r', encoding='utf-8-sig') as file:
        # Consumed inside the ``with`` block, so the file is still open.
        lines = (line.strip() for line in file)
        return [name for name in lines if name]
def get_compound_info(name: str):
    """
    Look up one compound name with pubchempy and yield a record per match.

    Every compound returned by ``pubchempy.get_compounds(name, 'name')`` is
    printed as a tab-separated summary line and yielded as a dict. A failed
    lookup is reported on stdout and ends the generator without raising, so
    one bad name does not abort a batch run.

    :param name: a single compound name (one entry of the name list)
    :return: generator of dicts with the keys ``name``,
        ``molecular_formula``, ``molecular_weight``, ``smiles``,
        ``synonyms`` and ``cid``
    """
    # A missing value (None, or the NaN pandas uses for an empty cell) or a
    # blank string cannot match any compound, so skip the lookup entirely.
    if name is None or (isinstance(name, float) and name != name):
        return
    if isinstance(name, str) and not name.strip():
        return

    try:
        compounds = pubchempy.get_compounds(name, 'name')
        for compound in compounds:
            result = {
                'name': name,
                'molecular_formula': compound.molecular_formula,
                'molecular_weight': compound.molecular_weight,
                'smiles': compound.isomeric_smiles,
                'synonyms': compound.synonyms,
                'cid': compound.cid
            }
            print(
                f"CID: {compound.cid}\tMass: {compound.exact_mass}\tName: {compound.iupac_name}\tMolfor: {compound.molecular_formula}\tSmi: {compound.isomeric_smiles}\tSyn: {compound.synonyms}")
            yield result
    except Exception as e:
        # Deliberately broad: any failure for this name is reported and the
        # records already yielded for it are kept.
        print("occurred error when processing compound", name)
        print(str(e))
def chemName_to_Smiles_or_CID(input_file_name, output_file_name):
    """
    Convert a file of compound names into a CSV of compound records.

    Names are loaded with ``read_file``, each one is looked up with
    ``get_compound_info``, and all yielded records are written to
    ``output_file_name`` as a comma-separated file without an index column.
    The row count and the elapsed time are printed at the end.

    :param input_file_name: path of the file holding compound names, in any
        format ``read_file`` accepts (e.g. a txt file with one name per line)
    :param output_file_name: path of the CSV file to write
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments during a long batch run.
    start_time = time.perf_counter()

    name_list = read_file(input_file_name)

    # Flatten the per-name generators into a single list of record dicts.
    rows = list(itertools.chain.from_iterable(get_compound_info(name) for name in name_list))

    if rows:
        dataframe = pd.DataFrame(rows)
    else:
        # No match at all: still emit a header row (the keys produced by
        # get_compound_info) so the output can be read back as a table.
        dataframe = pd.DataFrame(
            columns=['name', 'molecular_formula', 'molecular_weight', 'smiles', 'synonyms', 'cid'])
    dataframe.to_csv(output_file_name, index=False, sep=',')

    elapsed = time.perf_counter() - start_time
    # NOTE: this figure is the number of result rows written, which can
    # differ from the number of names that were looked up.
    print(f"Total queries: {len(dataframe)}")
    print(f"Elapsed time: {elapsed:.2f} seconds")
if __name__ == '__main__':
    # Script entry point: announce the run, then convert the names in
    # "1-TCMSP.csv" and write the records to "smiles_and_cid.csv".
    print("start convert")
    chemName_to_Smiles_or_CID(
        input_file_name="1-TCMSP.csv",
        output_file_name="smiles_and_cid.csv",
    )