网站地址https://dbaasp.org/peptide-card?id=DBAASPR_21623
通过requests保存html文件之后(可以使用此网站辅助获取),通过pymysql连接数据库保存数据,若有建议请大神指正https://curlconverter.com/https://curlconverter.com/
import json
import pymysql
import os
import re
import pandas as pd
import csv
#
# def save_data(data):
connection = pymysql.connect(host='',
user='',
password='',
database='')
# 创建游标对象
cur = connection.cursor()
create_table_query1 = '''
CREATE TABLE IF NOT EXISTS z_hemoly (
`id` INT,
`activityMeasureForLysisValue` VARCHAR(255),
`concentration` VARCHAR(255),
`ph` VARCHAR(255),
`ionicStrength` VARCHAR(255),
`saltType` VARCHAR(255),
`note` VARCHAR(255),
`reference` VARCHAR(255),
`activity` FLOAT,
`targetCell_name` VARCHAR(255),
`activityMeasureForLysisGroup_name` VARCHAR(255),
`unit_name` VARCHAR(255),
`unit_description` VARCHAR(255),
r_ID INT
)'''
create_table_query2 = '''
CREATE TABLE IF NOT EXISTS z_sourceGenes (
`id` INT,
`kingdom` VARCHAR(255),
`source` VARCHAR(255),
`subkingdom` VARCHAR(255),
`plasmid` VARCHAR(255),
`gene` VARCHAR(255),
`chromLoc` VARCHAR(255),
`orientation` VARCHAR(255),
`geneInSequence` VARCHAR(500),
`locus` VARCHAR(255),
`note` VARCHAR(255),
`dblink_name` VARCHAR(255),
`dblink_url` VARCHAR(500),
r_ID INT
)'''
create_table_query3 = '''
CREATE TABLE IF NOT EXISTS z_targetActivities (
`id` INT,
`targetSpecies_name` VARCHAR(255),
`targetSpecies_description` VARCHAR(255),
`activityMeasureGroup_name` VARCHAR(255),
`activityMeasureValue` VARCHAR(255),
`concentration` VARCHAR(255),
`unit_name` VARCHAR(255),
`unit_description` VARCHAR(255),
`ph` VARCHAR(255),
`ionicStrength` VARCHAR(255),
`saltType` VARCHAR(255),
`medium_name` VARCHAR(255),
`medium_description` VARCHAR(255),
`cfu` VARCHAR(255),
`cfuGroup` VARCHAR(255),
`note`VARCHAR(500),
`reference` VARCHAR(255),
`activity` VARCHAR(255),
r_ID INT
)'''
create_table_query4 = '''
CREATE TABLE IF NOT EXISTS z_antiActivities (
`id` INT,
`targetSpecies_name` text,
`activityMeasureGroup_name` text,
`activityMeasureGroup_description` text,
`activityMeasureValue` text,
`concentration` text,
`unit_name` text,
`unit_description` text,
`medium_name` text,
`medium_description` text,
`cfu` text,
`cfuGroup` text,
`note` text,
`reference` text,
`peptideId` text,
r_ID INT)
'''
# cursor.execute(create_table_query1)
# cursor.execute(create_table_query2)
# cursor.execute(create_table_query3)
# cursor.execute(create_table_query4)
folder_path = r"D:\demo1"
for filename in os.listdir(folder_path):
file_path = os.path.join(folder_path, filename)
if os.path.isfile(file_path) and filename.endswith(".html"):
with open(file_path, "r", encoding='utf-8') as file:
html_content = file.read()
data = json.loads(html_content)
print(data)
data['r_id'] = data["id"]
output_file = 'info.csv'
# 1.info
id = data["id"]
print(id)
name = data["name"]
synthesisType = data['synthesisType']['name']
complexity = data['complexity']['name']
sequence = data["sequence"]
nTerminus = data["nTerminus"]
cTerminus = data["cTerminus"]
sequenceLength = data["sequenceLength"]
targetGroups = []
for i in data['targetGroups']:
targetGroups.append(i['name'])
targetObjects = ''
try:
targetObjects = data['targetObjects'][0]['name']
except:
pass
noteReference = ''
try:
noteReference = data['noteReference']
noteReference = re.sub(r'<.*?>', '', noteReference)
except:
pass
# 构建插入语句
# insert_statement = '''INSERT INTO dbaasp_info (id, name, synthesisType, complexity, sequence, nTerminus, cTerminus, sequenceLength, targetGroups, targetObjects, noteReference)
# VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) '''
# values = (
# id, name, synthesisType, complexity, sequence, nTerminus, cTerminus, sequenceLength,
# ', '.join(targetGroups), targetObjects, noteReference)
# cur.execute(insert_statement, values)
with open(output_file, mode='a', encoding='utf-8',newline='') as file:
writer = csv.writer(file)
writer.writerow([id, name, synthesisType, complexity, sequence, nTerminus, cTerminus,
sequenceLength, ', '.join(targetGroups), targetObjects, noteReference])
# 2.uniprots
if 'uniprots' in data and data['uniprots']:
df_uniprots = pd.json_normalize(data, ['uniprots'], ["r_id"])
df_uniprots.fillna('', inplace=True)
columns_uniprots = ['id', 'uniprotId', 'description', 'uniprotUrl', 'proPeptide.name',
'proPeptide.description', 'r_id']
df_uniprots = df_uniprots[columns_uniprots]
# df_uniprots.to_sql('dbaasp_uniprots', cur, if_exists='append', index=False)
cur.executemany(
"INSERT INTO dbaasp_uniprots (id, uniprotId,description,uniprotUrl,proPeptide_name,proPeptide_description,r_id) "
"VALUES (%s, %s, %s, %s, %s, %s, %s)",
df_uniprots.values.tolist())
# 3.monomers
if 'monomers' in data and data['monomers']:
df_Sequence = pd.json_normalize(data, ['monomers'], ['r_id'])
columns = ['id', 'dbaaspId', 'name', 'majorName', 'sequence', 'sequenceLength', 'r_id']
df_Sequence = df_Sequence[columns]
# df_Sequence.to_sql('dbaasp_dbaaspr_monomers',cur,if_exists='append',index=False)
cur.executemany(
"INSERT INTO dbaasp_monomers (id, dbaaspId, name, majorName, sequence, sequenceLength, r_id) "
"VALUES (%s, %s, %s, %s, %s, %s, %s)",
df_Sequence.values.tolist())
# 4.处理 interchainBonds 字段
if 'interchainBonds' in data and data['interchainBonds']:
df_interchainBonds = pd.json_normalize(data, ['interchainBonds'], ['r_id'])
df_interchainBonds.fillna('', inplace=True)
# df_interchainBonds.to_sql('dbaasp_dbaaspr_interbonds', cur, if_exists='append', index=False)
cur.executemany(
"INSERT INTO dbaasp_interbonds (id, chain1, chain2, chain3, chain4, note,interChainBond_name,interChainBond_description, r_id) "
"VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
df_interchainBonds.values.tolist())
# 5.intrachainBonds
if 'intrachainBonds' in data and data['intrachainBonds']:
df_intrachainBonds = pd.json_normalize(data, ['intrachainBonds'], ['r_id'])
df_intrachainBonds.fillna('', inplace=True)
# df_intrachainBonds.to_sql('dbaasp_intrabonds', connection, if_exists='append', index=False)
cur.executemany(
"INSERT INTO dbaasp_intrabonds (id, position1,position2,note,type_id,type_name,"
"type_description,cycleType_id,cycleType_intraChainBondTypeId,cycleType_name,cycleType_description,"
"chainParticipating_id,chainParticipating_name,chainParticipating_description,r_id) "
"VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
df_intrachainBonds.values.tolist())
# 6.处理 physicoChemicalProperties 字段
if 'physicoChemicalProperties' in data and data['physicoChemicalProperties']:
df_physicoChemicalProperties = pd.json_normalize(data, ['physicoChemicalProperties'], ["r_id"])
df_physicoChemicalProperties.fillna('', inplace=True)
# df_physicoChemicalProperties.to_sql('dbaasp_dbaaspr_properties', cur, if_exists='append', index=False)
cur.executemany(
"INSERT INTO dbaasp_properties (name, value,r_id) "
"VALUES (%s, %s, %s)",
df_physicoChemicalProperties.values.tolist())
# 7.处理 structureModel 字段
if "structureModel" in data and data["structureModel"]:
structure_model = data["structureModel"]
df_structureModel = pd.json_normalize(structure_model)
df_structureModel["id"] = data["id"]
df_structureModel.fillna('', inplace=True)
# df_structureModel.to_sql('dbaasp_dbaaspr_structuremodel', cur, if_exists='append', index=False)
cur.executemany(
"INSERT INTO dbaasp_structuremodel (representativeStructureUrl, representativeStructureNGLUrl,selfConsistencyUrl"
",secondaryStructureUrl,trajectoryDCDUrl,trajectoryPSFUrl,trajectoryNGLUrl,trajectoryWithWaterDCDUrl,"
"trajectoryWithWaterPSFUrl,protocolId,protocolDescription, r_id) "
"VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
df_structureModel.values.tolist())
# 8.处理 articles 字段
for item_articles in data['articles']:
if item_articles['id']is not None:
id = item_articles['id']
else:
continue
if item_articles['year']is not None:
year = item_articles['year']
else:
continue
if item_articles['volume']is not None:
volume = item_articles['volume']
else:
continue
if item_articles['pages']is not None:
pages = item_articles['pages']
else:
continue
if item_articles['title']is not None:
title = item_articles['title']
else:
continue
if item_articles['additional']is not None:
additional = item_articles['additional']
else:
continue
if item_articles['authors']is not None:
authors = str(item_articles['authors'])
else:
continue
if 'journal' in item_articles and item_articles['journal'] is not None:
journal_name = item_articles['journal']['name']
journal_description = item_articles['journal']['description']
else:
continue
if item_articles['pubmed'] is not None:
pubmed_pubmedId = item_articles['pubmed'].get('pubmedId', None)
pubmed_pubmedUrl = item_articles['pubmed'].get('pubmedUrl', None)
else:
continue
insert_query = '''INSERT INTO dbaasp_dbaaspr_articles (id, year, volume, pages, title, additional, authors, journal_name, journal_description, pubmed_pubmedId, pubmed_pubmedUrl, r_id)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) '''
values = (id, year, volume,pages, title, additional, authors, journal_name, journal_description,
pubmed_pubmedId, pubmed_pubmedUrl,id)
cur.execute(insert_query, values)
# 9.处理 coordinationBonds 字段
if 'coordinationBonds' in data and data['coordinationBonds']:
df_coordinationBonds = pd.json_normalize(data, ['coordinationBonds'], ['r_id'])
df_coordinationBonds.fillna('', inplace=True)
# df_coordinationBonds.to_sql('dbaasp_dbaaspr_coordinationbonds', cur, if_exists='append', index=False)
cur.executemany(
"INSERT INTO dbaasp_coordinationbonds (id, peptideId, metalIons, positionOfAa, group, reference, bondNumber, r_id) "
"VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
df_coordinationBonds.values.tolist())
# 10.处理 unusualAminoAcids 字段
if 'unusualAminoAcids' in data and data['unusualAminoAcids']:
df_unusualAminoAcids = pd.json_normalize(data, ['unusualAminoAcids'], ['r_id'])
df_unusualAminoAcids.fillna('', inplace=True)
# df_unusualAminoAcids.to_sql('dbaasp_dbaaspr_unusualaminoacids', cur, if_exists='append', index=False)
cur.executemany(
"INSERT INTO dbaasp_unusualaminoacids (id, position, beforeModification, note, modificationType_name, modificationType_description, r_id) "
"VALUES (%s, %s, %s, %s, %s, %s, %s)",
df_unusualAminoAcids.values.tolist())
# 11.hemo
for item_home in data['hemoliticCytotoxicActivities']:
if item_home['activityMeasureForLysisGroup'] is not None:
activityMeasureForLysisGroup_name = item_home['activityMeasureForLysisGroup']['name']
else:
continue
if item_home['targetCell'] is not None:
targetCell_name = item_home['targetCell']['name']
else:
continue
if item_home['unit'] is not None:
unit_name = item_home['unit']['name']
if 'description' in item_home['unit']:
try:
unit_description = item_home['unit']['description']
except:
unit_description = None
else:
unit_description = None
else:
unit_name = None
unit_description = None
# unit_description = item_home['unit'].get('description', None)
insert_query = '''INSERT INTO dbaasp_hemoly
(id, activityMeasureForLysisValue, concentration, ph, ionicStrength, saltType, note, reference, activity, targetCell_name, activityMeasureForLysisGroup_name, unit_name, unit_description,r_id)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) '''
values = (item_home['id'], item_home['activityMeasureForLysisValue'], item_home['concentration'],
item_home['ph'], item_home['ionicStrength'], item_home['saltType'], item_home['note'],
item_home['reference'],
item_home['activity'],
targetCell_name,
activityMeasureForLysisGroup_name,
unit_name, unit_description, id)
cur.execute(insert_query, values)
# 12.source
for item_source in data['sourceGenes']:
if item_source['geneInSequence'] is not None:
geneInSequence = item_source['geneInSequence'].get('name', None)
else:
geneInSequence = None
if item_source['dblink'] is not None:
try:
dblink_name = item_source['dblink']['name']
except:
dblink_name = None
try:
dblink_url = item_source['dblink']['url']
except:
dblink_url = None
else:
dblink_name = None
dblink_url = None
if item_source['orientation'] is not None:
orientation = item_source['orientation']['name']
else:
orientation = None
insert_query = '''INSERT INTO dbaasp_dbaaspr_sourceGenes (id,kingdom, source, subkingdom, plasmid, gene, chromLoc, orientation, geneInSequence, locus,note, dblink_name, dblink_url, r_id)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) '''
values = (
item_source['id'], item_source['kingdom']['name'], item_source['source'], item_source['subkingdom'],
item_source['plasmid'], item_source['gene'], item_source['chromLoc'], orientation,
geneInSequence, item_source['locus'], item_source['note'], dblink_name, dblink_url, id)
cur.execute(insert_query, values)
# 13.targetActivities
for item_target in data["targetActivities"]:
if type(item_target) == dict:
if item_target['targetSpecies'] is not None:
targetSpecies_name = item_target['targetSpecies']['name']
# targetSpecies_description = item_target['targetSpecies']['description']
targetSpecies_description = item_target['targetSpecies'].get('description', None)
else:
continue
else:
continue
if item_target['activityMeasureGroup'] is not None:
activityMeasureGroup_name = item_target['activityMeasureGroup']['name']
else:
continue
if item_target['unit'] is not None:
unit_name = item_target['unit']['name']
if 'description' in item_target['unit']:
try:
unit_description = item_target['unit']['description']
except:
unit_description = None
else:
unit_description = None
else:
unit_name = None
unit_description = None
if item_target['medium'] is not None:
medium_name = item_target['medium']['name']
medium_description = item_target['medium']['description']
else:
continue
cfuGroup = None # 设置默认值为None
if item_target['cfuGroup'] is not None:
cfuGroup = item_target['cfuGroup']['name']
targetSpecies_name = item_target['targetSpecies']['name']
insert_query = """INSERT INTO dbaasp_dbaaspr_targetActivities (id, targetSpecies_name, targetSpecies_description, activityMeasureGroup_name, activityMeasureValue,
concentration, unit_name, unit_description, ph, ionicStrength, saltType, medium_name, medium_description, cfu, cfuGroup, note, reference, activity, r_id)
VALUES (%s, %s, %s, %s, %s,%s, %s, %s, %s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s) """
values = (item_target['id'], targetSpecies_name, targetSpecies_description, activityMeasureGroup_name,
item_target['activityMeasureValue'],
item_target['concentration'], unit_name, unit_description, item_target['ph'],
item_target['ionicStrength'],
item_target['saltType'], medium_name, medium_description, item_target['cfu'], cfuGroup,
item_target['note'], item_target['reference'], item_target['activity'], id)
cur.execute(insert_query, values)
# 14.antiActivities
for item_anti in data['antibiofilmActivities']:
activityMeasureGroup = item_anti.get('activityMeasureGroup')
if activityMeasureGroup is not None:
activityMeasureGroup_name = activityMeasureGroup.get('name')
activityMeasureGroup_description = activityMeasureGroup.get('description')
else:
activityMeasureGroup_name = None
activityMeasureGroup_description = None
unit = item_anti.get('unit')
if unit is not None:
unit_name = unit.get('name')
unit_description = unit.get('description')
else:
unit_name = None
unit_description = None
if item_anti['note'] is not None:
note = item_anti['note']
else:
note = None
cfuGroup = None # 设置默认值为None
if item_anti['cfuGroup'] is not None:
cfuGroup = item_anti['cfuGroup']['name']
targetSpecies_name = item_anti['targetSpecies']['name'] # 10635
medium_name = None
if item_anti.get('medium') is not None and item_anti['medium'].get('name') is not None:
medium_name = item_anti['medium']['name']
medium_description = None
if item_anti.get('medium') is not None and item_anti['medium'].get('description') is not None:
medium_description = item_anti['medium']['description']
cfu = None
if item_anti['cfu'] is not None:
cfu = item_anti['cfu']
reference = None
if item_anti['reference'] is not None:
reference = item_anti['reference']
peptideId = None
if item_anti['peptideId'] is not None:
peptideId = item_anti['peptideId']
activityMeasureValue = None
if item_anti['activityMeasureValue'] is not None:
activityMeasureValue = item_anti['activityMeasureValue']
concentration = None
if item_anti['concentration'] is not None:
concentration = item_anti['concentration']
insert_query = '''INSERT INTO dbaasp_antiActivities (id, targetSpecies_name, activityMeasureGroup_name, activityMeasureGroup_description,
activityMeasureValue, concentration, unit_name, unit_description, medium_name,
medium_description, cfu, cfuGroup, note, reference, peptideId, r_id)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
values = (item_anti['id'], targetSpecies_name, activityMeasureGroup_name,
activityMeasureGroup_description, activityMeasureValue,
concentration, unit_name, unit_description,
medium_name, medium_description, cfu, cfuGroup,
note, reference, peptideId, id)
cur.execute(insert_query, values)
# 提交更改并关闭连接
connection.commit()
cur.close()
connection.close()