环境:macOS + Python 3.9(Windows 下只需修改脚本中的保存目录)
效果图:
代码:
没有使用多线程;可按需修改 range() 的题号范围,或把范围拆分到多个脚本文件中同时运行。
1.neuqoj
import requests
from bs4 import BeautifulSoup
import time,os,re
import json
def write_in_file(f: str, string: str,
                  base_dir: str = '/Users/cyh/Desktop/acm/neuqacm/') -> None:
    """Append ``string`` plus a trailing newline to ``<base_dir>/<f>/<f>.txt``.

    Args:
        f: Name used both for the problem directory and for the text file
            inside it.
        string: Text to append; a newline is written after it.
        base_dir: Root directory holding the per-problem directories.
            Defaults to the path the script was written for.

    Raises:
        OSError: If the file cannot be opened, e.g. because the directory
            ``<base_dir>/<f>`` does not exist (this function does not
            create it).
    """
    target = os.path.join(base_dir, f, f + ".txt")
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(target, "a", encoding='utf-8') as fi:
        fi.write(string)
        fi.write("\n")
# Fetch problems from the NEUQ OJ JSON endpoint and store each one as a text
# file inside its own "<id><title>" directory.
link = "http://140.143.222.61:8088/problem/"
link2="http://newoj.acmclub.cn/problems/"  # not referenced by the code below
headers = {
    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1' ,
    'accept-language': 'zh-CN,zh'
}
# Must stay in sync with the root directory used by write_in_file().
save_root = '/Users/cyh/Desktop/acm/neuqacm/'

# Section headings written after the description; problem_des below is built
# in the same order.
the_title = ['难度', '输入描述', '输出描述', '样例输入', '样例输出']

# (old, new) replacements applied to the description, strictly in this order:
# the longer tag patterns have to run before the bare <p>/<span>/<font> ones,
# otherwise they would no longer match.
description_cleanup = (
    ('<div align="left"><span style="font-size: medium">', ' '),
    ('<font color="#000000">', ''),
    ('<span style="font-size: medium">', ''),
    ('<span style="font-size: small">', ''),
    ('<span>', ' '),
    ('</span>', ' '),
    ('<p><style type="text/css">p { margin-bottom: 0.21cm; }</style>\n', ' '),
    ('<p style="margin-bottom: 0cm;"><font color="#000000">', ' '),
    ("<p>", ' '),
    ("</p>", ' '),
    ("<font>", ' '),
    ("</font>", ' '),
    ("<br />", ' '),
    ('<style type="text/css">p { margin-bottom: 0.21cm; }</style>', ' '),
    # NOTE(review): as it appears in this listing the pattern is a single
    # space, which also removes the spaces inserted above. It may originally
    # have been "&nbsp;" or a non-breaking space -- confirm.
    (' ', ''),
)

for i in range (1002,1003):
    try:
        print("开始",i)
        r = requests.get(link+str(i),headers = headers,timeout = 100)
        data = r.json()['data']
        # "/" would be taken as a path separator in the directory name.
        problem_title = data['title'].replace("/", "比")
        folder = str(i)+problem_title
        # makedirs also creates missing parents and tolerates reruns.
        os.makedirs(os.path.join(save_root, folder), exist_ok=True)
        write_in_file(folder,"question: "+problem_title+"\n")
        problem_des = [data['difficulty'], data['input'], data['output'],
                       data['sample_input'], data['sample_output']]
        print("写入"+str(i) +" file")
        description = data['description']
        for old, new in description_cleanup:
            description = description.replace(old, new)
        print(description)
        write_in_file(folder,'题目描述'+":\n"+description+"\n")
        for heading, value in zip(the_title, problem_des):
            write_in_file(folder,heading+":\n"+str(value)+"\n")
        print("done")
    except Exception as exc:
        # Best-effort crawl: report why this id was skipped and move on.
        # Unlike a bare except, this lets Ctrl-C stop the run.
        print("跳过", exc)
2.hduacm
import requests
from bs4 import BeautifulSoup
import time,os
def write_in_file(f: str, string: str,
                  base_dir: str = '/Users/cyh/Desktop/acm/hduacm/') -> None:
    """Append ``string`` to ``<base_dir>/<f>/<f>.txt``.

    Args:
        f: Name used both for the problem directory and for the text file
            inside it.
        string: Text to append; it is written as-is, with no newline added.
        base_dir: Root directory holding the per-problem directories.
            Defaults to the path the script was written for.

    Raises:
        OSError: If the file cannot be opened, e.g. because the directory
            ``<base_dir>/<f>`` does not exist (this function does not
            create it).
    """
    target = os.path.join(base_dir, f, f + ".txt")
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(target, "a", encoding='utf-8') as fi:
        fi.write(string)
# Fetch problem pages from HDU OJ and store the text of every
# panel_title / panel_content pair in "<id><title>/<id><title>.txt".
link = "http://acm.hdu.edu.cn/showproblem.php?pid="
headers = {
    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
}
# Must stay in sync with the root directory used by write_in_file().
save_root = '/Users/cyh/Desktop/acm/hduacm/'

for i in range (6937,6939):
    try:
        print("开始",i)
        r = requests.get(link+str(i),headers = headers,timeout = 100)
        print("OK")
        soup = BeautifulSoup(r.text,"lxml")
        heading = soup.find("h1")  # the page's <h1> is used as the title
        if heading is None:
            # Without an <h1> there is nothing to name the output after.
            print("跳过", i)
            continue
        # "/" would be taken as a path separator in the directory name.
        problem_title = heading.text.replace("/", "比")
        folder = str(i)+problem_title
        # makedirs also creates missing parents and tolerates reruns.
        os.makedirs(os.path.join(save_root, folder), exist_ok=True)
        write_in_file(folder,"question: "+problem_title+"\n")
        problem_des = soup.find_all("div",class_="panel_content")
        the_title = soup.find_all("div",class_ ="panel_title")
        print("写入"+str(i) +" file")
        # zip pairs each heading with its body and stops at the shorter list,
        # so a page with unequal counts cannot raise IndexError.
        for panel_title, panel_body in zip(the_title, problem_des):
            write_in_file(folder,panel_title.text+": "+panel_body.text+"\n")
        print("done")
    except (requests.RequestException, OSError) as exc:
        # A network or filesystem failure on one problem should not abort
        # the rest of the range; report it and continue with the next id.
        print("跳过", i, exc)
3.pkuacm
import requests
from bs4 import BeautifulSoup
import time,os,re
from lxml import etree
def write_in_file(f: str, string: str,
                  base_dir: str = '/Users/cyh/Desktop/acm/pkuacm/') -> None:
    """Append ``string`` to ``<base_dir>/<f>/<f>.html``.

    Args:
        f: Name used both for the problem directory and for the HTML file
            inside it.
        string: Text to append; it is written as-is, with no newline added.
        base_dir: Root directory holding the per-problem directories.
            Defaults to the path the script was written for.

    Raises:
        OSError: If the file cannot be opened, e.g. because the directory
            ``<base_dir>/<f>`` does not exist (this function does not
            create it).
    """
    target = os.path.join(base_dir, f, f + ".html")
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(target, "a", encoding='utf-8') as fi:
        fi.write(string)
# Fetch problem pages from POJ and store the serialized second <table> of
# each page in "<id><title>/<id><title>.html".
link = "http://poj.org/problem?id="
headers = {
    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
}
# Must stay in sync with the root directory used by write_in_file().
save_root = '/Users/cyh/Desktop/acm/pkuacm/'
count=[0,0]  # count[0]: problems saved, count[1]: problems that failed

for i in range (2577,3000):
    try:
        print("开始",i)
        t='&lang=zh-CN&change=true'
        r = requests.get(link+str(i)+t,headers = headers,timeout = 100)
        print("OK")
        c=etree.HTML(r.content,parser=etree.HTMLParser())
        d=c.xpath("/html/body/table[2]")
        e=c.xpath('/html/body/table[2]/tr/td/div[2]')
        # Read the title from the element's text nodes instead of
        # string-replacing its serialized markup, which left "</div>" in the
        # title whenever the literals did not match exactly. An empty xpath
        # result raises IndexError here and is counted as a failure below.
        problem_title = "".join(e[0].itertext()).strip()
        print(problem_title)
        content=etree.tostring(d[0],encoding='utf-8').decode('utf-8')
        # "/" would be taken as a path separator in the directory name.
        problem_title = problem_title.replace("/", "比")
        # Use one normalized name for both the directory and the file, so the
        # directory that is created is the one write_in_file() opens.
        folder = str(i)+problem_title
        # makedirs also creates missing parents and tolerates reruns.
        os.makedirs(os.path.join(save_root, folder), exist_ok=True)
        write_in_file(folder, content)
        count[0]+=1
    except Exception as exc:
        # Best-effort crawl: count the failure, show its cause, and move on.
        # Unlike a bare except, this lets Ctrl-C stop the run.
        count[1]+=1
        print("pass:",count[1], exc)
# NOTE(review): the listing lost its indentation; this summary is assumed to
# run once after the loop -- confirm.
print("完成",count[0])