工作文件:
1.autoget.sh
2.autoget.py
3.orgin_list.txt:输入列表,第三列为 NCBI 登录号(accession number)
运行方式:在包含以上三个文件的目录下执行 autoget.sh
autoget.sh
#!/bin/bash
#
# autoget.sh - resolve NCBI accession numbers to sequence ids with autoget.py,
# then download every resolved sequence as FASTA with curl.
#
# Inputs (current directory):
#   orgin_list.txt  whitespace-separated table; column 3 is the accession
#   autoget.py      prints "<accession> <id>" on stdout for one accession
# Outputs:
#   <accession>.fasta  one file per resolved accession
# Side effects:
#   orgin_list.txt is emptied once every accession has been resolved; it is
#   left untouched when some accessions are still unresolved.
#   The scratch files list.txt, res.txt and list.bad are removed at the end.
# Environment:
#   AUTOGET_MAX_ROUNDS  maximum number of lookup passes (default 10)
# Exit status:
#   0 all accessions resolved, 1 input missing or accessions left unresolved,
#   2 invalid AUTOGET_MAX_ROUNDS.

max_rounds=${AUTOGET_MAX_ROUNDS:-10}
case $max_rounds in
  '' | *[!0-9]*)
    printf 'autoget.sh: AUTOGET_MAX_ROUNDS must be a non-negative integer\n' >&2
    exit 2
    ;;
esac

if [ ! -r orgin_list.txt ]; then
  printf 'autoget.sh: cannot read orgin_list.txt\n' >&2
  exit 1
fi

# Start with an empty result file; lookups append to it.
: > res.txt

# Column 3 of orgin_list.txt holds the NCBI accession numbers; change the awk
# field here if your data uses a different layout. Rows with fewer than three
# columns are skipped so no blank accession ends up in the work list.
awk 'NF >= 3 { print $3 }' orgin_list.txt | sort -u > list.txt

# Retry loop: list.txt always holds the accessions that still lack a result.
# The pass counter bounds the loop so an accession that can never be resolved
# does not keep the script running forever.
round=0
while [ -s list.txt ] && [ "$round" -lt "$max_rounds" ]; do
  round=$((round + 1))

  while IFS= read -r file; do
    # stdin is redirected so the child cannot consume the rest of list.txt.
    python autoget.py "$file" < /dev/null | tee -a res.txt
  done < list.txt

  # Collect the accessions that produced no result line in this or any
  # earlier pass. The first column is compared exactly, so an accession is
  # not treated as resolved merely because it is a substring of another one.
  : > list.bad
  while IFS= read -r file; do
    if ! awk -v acc="$file" '$1 == acc { found = 1 } END { exit !found }' res.txt; then
      printf '%s\n' "$file" >> list.bad
    fi
  done < list.txt

  cp -p list.bad list.txt
done

# Each line of res.txt is "<accession> <id>"; fetch the FASTA view for the id
# and store it under the accession name.
while read -r code id; do
  if [ -z "$code" ] || [ -z "$id" ]; then
    continue
  fi
  curl "https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=${id}&db=nuccore&report=fasta&extrafeat=null&conwithfeat=on&hide-cdd=on&retmode=html&withmarkup=on&tool=portal&log$=seqview&maxdownloadsize=100000000" -o "${code}.fasta"
done < res.txt

status=0
if [ -s list.txt ]; then
  # Keep the input so the unresolved accessions can be retried later.
  printf 'autoget.sh: unresolved after %s pass(es):\n' "$round" >&2
  cat list.txt >&2
  status=1
else
  # Everything was resolved: clear the input list, as the original workflow does.
  : > orgin_list.txt
fi

rm -f -- list.txt res.txt list.bad
exit "$status"
autoget.py
##python3.8
import sys
import asyncio
from pyppeteer import launch
from pyquery import PyQuery as pq
from lxml import etree
# Accession to look up, taken from the first command-line argument. An empty
# string stands in for a missing argument so that importing this module never
# raises; the entry point at the bottom rejects it before any network access.
item = sys.argv[1].strip() if len(sys.argv) > 1 else ""
baseUrl = "https://www.ncbi.nlm.nih.gov/nuccore/{}?report=fasta".format(item)

# Absolute XPath of the element whose "val" attribute is printed next to the
# accession. NOTE(review): this path is tied to the current page layout and
# will need updating if NCBI changes the page structure.
_VAL_XPATH = '/html/body/div[1]/div[1]/form/div[1]/div[5]/div/div[5]/div[2]/div[1]/@val'


async def main():
    """Load the nuccore FASTA page for ``item`` and print ``item`` with its id.

    The page is rendered in a headless browser started by pyppeteer, and the
    ``val`` attribute selected by ``_VAL_XPATH`` is read from the rendered
    markup. On success one line, ``"<item> <val>"``, is written to stdout;
    autoget.sh reads that line as ``<code> <id>``.

    Returns:
        0 when the attribute was found and printed, 1 when the page did not
        contain it (a message is written to stderr and nothing to stdout).
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        # 60000; pyppeteer interprets the navigation timeout in milliseconds.
        await page.goto(baseUrl, {'timeout': 10000 * 6})
        Html = pq(await page.content()).html()
    finally:
        # Always shut the browser down, even when navigation fails.
        await browser.close()

    html_data = etree.HTML(Html).xpath(_VAL_XPATH)
    if not html_data:
        print("autoget.py: no id found for {}".format(item), file=sys.stderr)
        return 1

    link_id = html_data[0]
    print(item, link_id)
    return 0


if __name__ == "__main__":
    if not item:
        sys.exit("usage: python autoget.py <accession>")
    sys.exit(asyncio.get_event_loop().run_until_complete(main()))