# This is still a fairly basic version; the speed seems acceptable in a quick trial.
# Recording the code for now — I will refine it and add features soon.
# Tip: the intercepted data can be pretty-printed with json.dumps and saved for
# inspection; its structure is very clear.
import asyncio
from playwright.async_api import async_playwright
import pandas as pd
async def handle_json(json_data):
    """Buffer an intercepted JSON payload into the module-level list ``tl``.

    A list payload is merged element-wise; a dict payload is appended as a
    single record.  Any other payload type is silently ignored.
    """
    global tl
    if isinstance(json_data, list):
        tl.extend(json_data)
    elif isinstance(json_data, dict):
        tl.append(json_data)
async def handle(gene, request, response):
    """Process one intercepted network event for *gene*.

    Request events (``response is None``) are ignored.  For a response whose
    URL contains the gene symbol, the JSON body is buffered into ``tl`` via
    ``handle_json``; afterwards every buffered record is classified:
    gene-level descriptions are appended to ``geneDescs`` and variant-level
    annotations to ``variants``.  Duplicates are expected here and are
    dropped later, in ``run()``.
    """
    global geneDescs
    global variants
    # Guard clauses: only gene-matching responses carry data we want.
    if response is None:
        return
    if gene not in response.url:
        return
    try:
        # Hand the response's JSON payload to handle_json for buffering.
        await handle_json(await response.json())
    except Exception:
        # Best effort: non-JSON bodies (images, scripts, ...) are skipped.
        # Was a bare `except:`, which also swallowed KeyboardInterrupt/
        # CancelledError — narrowed to Exception.
        print('----------------------------')
    for tll in tl:
        if "description" in tll:
            geneDesc = {'gene': tll["gene"]["hugoSymbol"], "description": tll["description"]}
            geneDescs.append(geneDesc)
        elif "mutationEffectDescription" in tll:
            result_dict = {
                "gene": tll["variant"]["gene"]["hugoSymbol"],
                "alteration": tll["variant"]["alteration"],
                "mutationEffect": tll["mutationEffect"],
                "oncogenic": tll["oncogenic"],
                "mutationEffectDescription": tll["mutationEffectDescription"],
            }
            variants.append(result_dict)
async def parse_page(gene, browser):
    """Open the OncoKB page for *gene* in a fresh context and let the
    response listener harvest the API JSON into the module-level buffers.

    Fix: the context is now always closed (try/finally) — previously it was
    leaked on every call, and especially when ``goto`` raised TimeoutError.
    """
    context = await browser.new_context()
    try:
        page = await context.new_page()
        # Matching response bodies are parsed by handle(); the request
        # listener is kept for symmetry (handle() ignores request events).
        page.on("request", lambda request: handle(gene, request=request, response=None))
        page.on("response", lambda response: handle(gene, response=response, request=None))
        # goto(url, timeout=...) raises TimeoutError if navigation does not
        # complete within the deadline (slow network/server, heavy page).
        await page.goto(f'https://www.oncokb.org/gene/{gene}', timeout=240000)
        print(f'https://www.oncokb.org/gene/{gene}')
        # wait_for_timeout simply pauses: give late XHR responses time to
        # arrive before the context is torn down.
        await page.wait_for_timeout(8000)
    finally:
        await context.close()
async def main():
    """Launch headless Chromium and scrape every gene page concurrently."""
    gene_symbols = ['ABL1', 'AKT1', 'ALK', 'AMER1', 'APC', 'AR', 'ARID1A', 'ASXL1', 'ATM', 'ATRX', 'AXIN1']  # demo: genes starting with 'A'
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        # Fan out one parse_page task per gene and wait for all of them.
        await asyncio.gather(*(parse_page(symbol, browser) for symbol in gene_symbols))
        await browser.close()
def run(genes_path=r"D:\Genes.xlsx", variants_path=r"D:\Variants.xlsx"):
    """Run the full scrape, then persist de-duplicated results to Excel.

    Parameters
    ----------
    genes_path : str, optional
        Output path for the per-gene description sheet
        (default preserves the original hard-coded location).
    variants_path : str, optional
        Output path for the variant annotation sheet.
    """
    # Drive the whole async scraping pipeline to completion.
    asyncio.run(main())
    # Gene descriptions: drop exact duplicate rows, then merge each gene's
    # descriptions into a single string ordered by length (shortest first).
    df_gu = pd.DataFrame(geneDescs).drop_duplicates()
    df_me = (df_gu.groupby('gene')['description']
             .apply(lambda x: ' '.join(sorted(x, key=len)))
             .reset_index())
    df_me.to_excel(genes_path, index=False)
    # Variant records: de-duplicate and write as-is.
    df_unique = pd.DataFrame(variants).drop_duplicates()
    df_unique.to_excel(variants_path, index=False)
if __name__=='__main__':
    # Module-level buffers shared (via `global`) by the async handlers.
    tl = []         # raw JSON records intercepted from responses
    geneDescs = []  # per-gene description rows
    variants = []   # per-variant annotation rows
    # Run the scrape and write the Excel outputs.
    run()