最近同学请我帮忙处理一批数据:需要把序列提交到 pkumdl 的 PSPredictor 网站(http://www.pkumdl.cn:8000/PSPredictor/ )并取回预测结果。我分析了该网站的请求方式,写了一个很简单的爬虫脚本:按批提交序列,并把返回的结果表格保存为 CSV 文件。这里分享给大家。
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from tqdm import tqdm
# PSPredictor endpoint that parse_html() POSTs each batch of sequences to.
url = 'http://www.pkumdl.cn:8000/PSPredictor/all_scripts/runprediction.php'
def parse_html(in_seq, count=10):
    """Submit one batch of sequences and save the returned table as CSV.

    The batch is POSTed to ``url``; the first element of the response is
    parsed as HTML and every ``<table>`` found in it is written to
    ``tables/<count>.csv`` (no index, no header row).  If that file
    already exists the call is a no-op, so an interrupted run can be
    restarted without re-submitting finished batches.

    Args:
        in_seq: Text of the batch, sent verbatim as the ``inseq`` form field.
        count: Number used to name the output file.

    Returns:
        None.

    Raises:
        requests.RequestException: On timeout, connection failure, or an
            HTTP error status.
        ValueError, SyntaxError: If the response body is not a plain
            Python literal.
    """
    # Local imports so this function stays self-contained.
    import ast
    from io import StringIO

    csv_name = os.path.join('tables', str(count) + '.csv')
    if os.path.exists(csv_name):
        return

    # to_csv() does not create missing parent directories.
    os.makedirs(os.path.dirname(csv_name), exist_ok=True)

    myobj = {
        'filenamedir': 'user_sess/2022_10_22_01_49_29pm_testuser',
        'inseq': in_seq,
        "taskid": '1',
    }
    # Without a timeout a stalled server would hang the whole run forever.
    # (connect, read) seconds; the read limit is generous because the
    # prediction for a full batch may take a while.
    results = requests.post(url, data=myobj, timeout=(10, 600))
    results.raise_for_status()

    # SECURITY: the body comes from a remote server, so it must not be
    # passed to eval().  literal_eval accepts the same list/str literals
    # but refuses to execute arbitrary expressions.
    text = ast.literal_eval(results.text.strip())[0]
    print(text)

    soup = BeautifulSoup(text, "lxml")
    # NOTE(review): every table is written to the same path, so when the
    # response holds several tables only the last one survives.  Kept as
    # in the original; confirm that a single result table is expected.
    for table in soup.find_all('table'):
        # Recent pandas deprecates passing literal HTML; wrap it in a
        # file-like object instead.
        for df in pd.read_html(StringIO(str(table))):
            df.to_csv(csv_name, index=False, header=False)
# Driver: read the two-line-per-record sequence file (a '>' header line
# followed by one sequence line) and submit it in batches of `frenquency`.
with open('output.txt', 'r') as f:
    in_seqs = f.readlines()

count = 0          # records seen so far; also names the batch's CSV file
frenquency = 100   # records per submitted batch
in_seq = ""        # text of the batch currently being assembled
flag = False       # becomes True once the resume marker has been reached

# Stop at len - 1 so a dangling header line without a sequence line at the
# end of the file cannot raise IndexError on in_seqs[i + 1].
for i in tqdm(range(0, len(in_seqs) - 1, 2)):
    # Records before this header are skipped; collection starts here.
    if in_seqs[i].strip() == '>OsMH_05T0333800.1':
        flag = True
    # Counted for every record (skipped or not) so batch numbers, and
    # therefore CSV names, do not depend on where collection started.
    count += 1
    if flag:
        in_seq += in_seqs[i] + in_seqs[i + 1]
    # Never submit an empty batch (all records so far were skipped).
    if count % frenquency == 0 and in_seq:
        parse_html(in_seq, count=count)
        in_seq = ""

# Flush the final partial batch; without this the last
# (count % frenquency) records would never be submitted.
if in_seq:
    parse_html(in_seq, count=count)
可能有人会问 output.txt 里是什么内容,这里贴出一部分。文件每两行为一条记录:第一行是以 > 开头的序列名,第二行是对应的序列:
>OsMH_01T0000100.1
MPPVQSPHPIRLPNKRSRQADKPLNQGVLAEEALDPRRSYGLPAHQAASHNQLTRSHASSDFNLPSFPGAISCSSTAREMSSAAGQDNGDTAGDYIKWMCGAGGRAGGAMANLQRGVGSLVRDIGDPCLNPSPVKGSKMLKPEKWHTCFDNDGKVIGFRKALKFIVLGGVDPTIRAEVWEFLLGCYALSSTSEYRRKLRAVRSARACTQALVQDYNDMGEPRYDTETFDDYPSLPVTNFFSTDGVGSNGVDKNHCSFSVPEDRLRHRDEHSLRISDAPEADFVDGTKSNSVVASKDRVSEWLWTLHRIVVDVVRTDSHLDFYGESRNMARMSDILAVYAWVDPSTGYCQGMSDLLSPFVVLYEDDADAFWCFEMLLRRMRENFQMEGPTGVMKQLQALWKIMEITDVELFEHLSTIGAESLHFAFRMLLVLFRRELSFEESLSMWENCLEPLLVDMRNDLSCEVKEEHRVNSYTRRKSKSRKPHHRNGEMRVACNLGMKPNTRNPLCGLSGATIWARHQQMPHISTNVLAKNGDDDLPIFCVAAILMFNDNMLKINVKRCVRMAIKLRKKGARNDEKVETEVEKSPWGTRCFSTMNFSESVHMHSILENKRRSGGRIIEKTAMLEAGERDADETHAWSGTASPAALWKTVASSAAMLKLALAMISAAFRTTPFSMSMQLCPNATMSLHSPSIFDVVSSITPIMSCIINNRRKKKRKGKGKVLGLTPTCGWSCCTPVPASLHSPSPPTLTSASNSNSNPMDSIRRRSAGGILGILFLVLLRWAGAGDPYAYYEWEVSYVWGAPLGGVKKQEAIGINGQLPGPALNVTTNWNLVVNVRNGLDEPLLLTWHGVQQRKSPWQDGVGGTNCGIPPGWNWTYQFQVKDQVGSFFYAPSTALHRAAGGYGAITINNRDVIPLPFPLPDGGDITLFLADWYARDHRALRRALDAGDPLGPPDGVLINALGPYRYNDTLVPPGVTYERINVDPGRTYRLRVHNVGVATSLNFRIQGHNLLLVEAEGSYTSQQNYTNMDIHVGQSYSFLLTMDQNASTDYYVVASARFVPDADKLTGVAILHYSNSQGPPSGSLPDAPDDQYDTAFSINQARSIRWNVTASGARPNPQGSFHYGDITVTDVYLLQSMPPELIDGQMRATLNGISYIAPSTPLMLAQLFNVPGVYKLDFPNRPMNRLPKLDTSIINGTYKGFMEIIFQNNATSVQSYHLDGYAFFVVGMDYGLWTDNSRGTYNKWDGVARSTIQVFPGAWTAVLVFLDNAGIWNLRVENLDAWYLGQEVYISVVNPEDSSNKTVLPLPDNAIFCGALSSLQKEQSHRFQYSEASQITQLWKMVFFMAWLALW
>OsMH_01T0000100.2
MPPVQSPHPIRLPNKRSRQADKPLNQGVLAEEALDPRRSYGLPAHQAASHNQLTRSHASSDFNLPSFPGAISCSSTAREMSSAAGQDNGDTAGDYIKWMCGAGGRAGGAMANLQRGVGSLVRDIGDPCLNPSPVKGSKMLKPEKWHTCFDNDGKVIGFRKALKFIVLGGVDPTIRAEVWEFLLGCYALSSTSEYRRKLRAVRREKYQILVRQCQSMHPSIGTGELAYAVGSKLMDVRTMSKETHIAEEVSTSQQTSQNTAGSLVEDSDYGPGGAQQSQKRESCSKSAELVGFNVHNDTSLYDSSNFIVSSTEVNNCSKDSQDYNDMGEPRYDTETFDDYPSLPVTNFFSTDGVGSNGVDKNHCSFSVPEDRLRHRDERMHSFQINNNIDLIIESNSCSSDVFRASNSDSAIFHSDAYKQDRWLDDNGYNREVIDSLRISDAPEADFVDGTKSNSVVASKDRVSEWLWTLHRIVVDVVRTDSHLDFYGESRNMARMSDILAVYAWVDPSTGYCQGMSDLLSPFVVLYEDDADAFWCFEMLLRRMRENFQMEGPTGVMKQLQALWKIMEITDVELFEHLSTIGAESLHFAFRMLLVLFRRELSFEESLSMWENCLEPLLVDMRNDLSCEVKEEHRVNSYTRRKSKSRKPHHRNGEMRVACNLGMKPNTRNPLCGLSGATIWARHQQMPHISTNVLAKNGDDDLPIFCVAAILMFNDNMLKINVKRCVRMAIKLRKKGARNDEKVETEVEKSPWGTRCFSTMNFSESVHMHSILENKRRSGGRIIEKTAMLEAGERDADETHAWSGTASPAALWKTVASSAAMLKLALAMISAAFRTTPFSMSMQLCPNATMSLHSPSIFDVVSSITPIMSCIINNRRKKKRKGKGKVLGLTPTCGWSCCTPVPASLHSPSPPTLTSASNSNSNPMDSIRRRSAGGILGILFLVLLRWAGAGDPYAYYEWEVSYVWGAPLGGVKKQEAIGINGQLPGPALNVTTNWNLVVNVRNGLDEPLLLTWHGVQQRKSPWQDGVGGTNCGIPPGWNWTYQFQVKDQVGSFFYAPSTALHRAAGGYGAITINNRDVIPLPFPLPDGGDITLFLADWYARDHRALRRALDAGDPLGPPDGVLINALGPYRYNDTLVPPGVTYERINVDPGRTYRLRVHNVGVATSLNFRIQGHNLLLVEAEGSYTSQQNYTNMDIHVGQSYSFLLTMDQNASTDYYVVASARFVPDADKLTGVAILHYSNSQGPPSGSLPDAPDDQYDTAFSINQARSIRWNVTASGARPNPQGSFHYGDITVTDVYLLQSMPPELIDGQMRATLNGISYIAPSTPLMLAQLFNVPGVYKLDFPNRPMNRLPKLDTSIINGTYKGFMEIIFQNNATSVQSYHLDGYAFFVVGMDYGLWTDNSRGTYNKWDGVARSTIQVFPGAWTAVLVFLDNAGIWNLRVENLDAWYLGQEVYISVVNPEDSSNKTVLPLPDNAIFCGALSSLQKEQSHRFQYSEASQITQLWKMVFFMAWLALW
......
其实就是这种 FASTA 格式的蛋白质(氨基酸)序列。