美赛论文成绩批量爬取
主要使用 requests 库进行简单的爬取。关键是美赛论文组没有对其做反爬取处理，否则爬取估计会困难得多。
代码以 2127310 为基准队号，依次爬取其后约 50 组队伍的数据（实际为 2127311–2127359，共 49 组），经查看是完全可行的。
import requests
import time
import re
import random
import urllib
# Directory prefix that downloaded PDFs are written under; the file name is
# appended to it by plain string concatenation, so it must end with a
# path separator.
# A raw string literal cannot end in a single backslash, and the previous
# workaround (a trailing space inside the raw string) leaked that space into
# every saved file name (" 11.pdf"). An escaped normal literal avoids both.
path = 'E:\\美赛\\'

# Request headers sent with every download; only a desktop Chrome User-Agent
# is set.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/73.0.3683.103 Safari/537.36'
}
def get_urls(base_id: int = 2127310, count: int = 49) -> list:
    """Build the certificate PDF URLs for a run of consecutive team numbers.

    The team numbers covered are ``base_id + 1`` through ``base_id + count``
    inclusive; ``base_id`` itself is not included. With the defaults this
    yields the 49 URLs for teams 2127311..2127359.

    Args:
        base_id: Team number immediately before the first one to fetch.
        count: How many consecutive team numbers to generate. Zero or a
            negative value produces an empty list.

    Returns:
        A list of URL strings in ascending team-number order.
    """
    url_prefix = "https://www.comap-math.com/mcm/2021Certs/"
    return [
        url_prefix + str(base_id + offset) + ".pdf"
        for offset in range(1, count + 1)
    ]
def get_text(url, filename=None, timeout=30):
    """Download the file at *url* and save it as ``<path><filename>.pdf``.

    Failures are reported on stdout and signalled through the return value
    instead of being raised, so one bad URL does not abort a batch run.

    Args:
        url: Address of the PDF to fetch.
        filename: Base name (without extension) for the saved file. When
            omitted, the module-level ``name`` counter is used, which keeps
            existing ``get_text(url)`` calls working unchanged.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block forever.

    Returns:
        True if the file was downloaded and written, False otherwise.
    """
    if filename is None:
        filename = name  # module-level counter maintained by the caller

    try:
        res = requests.get(url, headers=headers, timeout=timeout)
        # Reject 4xx/5xx responses; otherwise an HTML error page would be
        # saved with a .pdf extension and reported as a success.
        res.raise_for_status()
    except requests.RequestException as err:
        print(f'{filename} 下载失败: {err}')
        return False

    # Only the raw bytes are written, so no text-encoding detection is
    # needed on the response.
    try:
        with open(path + f'{filename}.pdf', 'wb') as f:
            f.write(res.content)
    except OSError as err:
        print(f"Exception: {err}")
        return False

    print(f'{filename} 下载完成')
    return True
# Driver: fetch every generated URL in order, saving the files as 11.pdf,
# 12.pdf, ... `name` is a module-level global that get_text() reads to pick
# the output file name, so it must be bound before each call.
name = 10
urls = get_urls()
for name, url in enumerate(urls, start=name + 1):
    print(url)
    get_text(url)
    # Pause a random 1-3 whole seconds between requests.
    pause_seconds = random.randint(1, 3)
    time.sleep(pause_seconds)