爬取三个 ACM 网站的题库(neuqoj、hdu、pku)

环境:macOS + Python 3.9(在 Windows 上运行时只需修改代码中的保存目录)

效果图:

代码:

代码没有使用多线程;如需加快爬取,可按需修改 range() 的题号范围,或把题号区间拆分到多个脚本中同时运行。

1.neuqoj

import requests
from bs4 import BeautifulSoup
import time,os,re
import json
def write_in_file(f, string, base_dir='/Users/cyh/Desktop/acm/neuqacm/'):
    """Append ``string`` followed by a newline to ``<base_dir>/<f>/<f>.txt``.

    Args:
        f: Name used for both the per-problem folder and the text file in it.
        string: Text to append; a newline is written after it.
        base_dir: Root folder that holds the per-problem folders. Defaults to
            the path this script was written for; pass another root (for
            example on Windows) instead of editing the code.
    """
    folder = os.path.join(base_dir, f)
    # Create the folder on demand so the function also works on a fresh root.
    os.makedirs(folder, exist_ok=True)
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(folder, f + ".txt"), "a+", encoding='utf-8') as fi:
        fi.write(string)
        fi.write("\n")
		
		
# JSON endpoint for a single problem; the numeric problem id is appended.
link = "http://140.143.222.61:8088/problem/"
# Second site address; not used by the loop below.
link2 = "http://newoj.acmclub.cn/problems/"
headers = {
    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
    'accept-language': 'zh-CN,zh',
}
# Root folder that receives one sub-folder per problem.
output_root = '/Users/cyh/Desktop/acm/neuqacm/'
# (fragment, replacement) pairs applied top to bottom to the description.
# Order matters: a compound fragment must come before the shorter fragments
# it contains, otherwise the shorter rule removes part of it first and the
# compound rule can never match.
description_replacements = (
    ('<div align="left"><span style="font-size: medium">', ' '),
    ('<p style="margin-bottom: 0cm;"><font color="#000000">', ' '),
    ('<font color="#000000">', ''),
    ('<span style="font-size: medium">', ''),
    ('<span style="font-size: small">', ''),
    ('<span>', ' '),
    ('</span>', ' '),
    ('<p><style type="text/css">p { margin-bottom: 0.21cm; }</style>\n\t\t\t', ' '),
    ('<p>', ' '),
    ('</p>', ' '),
    ('<font>', ' '),
    ('</font>', ' '),
    ('<br />', ' '),
    ('<style type="text/css">p { margin-bottom: 0.21cm; }</style>', ' '),
    ('&nbsp;', ''),
)


def clean_description(text):
    """Return ``text`` with the known markup fragments replaced.

    Applies every pair in ``description_replacements`` in order using plain
    substring replacement; anything not listed there is left untouched.
    """
    for fragment, replacement in description_replacements:
        text = text.replace(fragment, replacement)
    return text


for i in range(1002, 1003):
    try:
        print("开始", i)
        r = requests.get(link + str(i), headers=headers, timeout=100)
        data = r.json()['data']
        # "/" would be read as a path separator in the folder name.
        problem_title = data['title'].replace("/", "比")
        folder = str(i) + problem_title
        # exist_ok makes a re-run reuse the folder instead of failing.
        os.makedirs(output_root + folder, exist_ok=True)
        write_in_file(folder, "question: " + problem_title + "\n")
        problem_des = [
            data['difficulty'],
            data['input'],
            data['output'],
            data['sample_input'],
            data['sample_output'],
        ]
        the_title = ['难度', '输入描述', '输出描述', '样例输入', '样例输出']
        print("写入" + str(i) + " file")
        description = clean_description(data['description'])
        print(description)
        write_in_file(folder, '题目描述' + ":\n" + description + "\n")
        # Labels and values are parallel lists; write them pairwise.
        for label, value in zip(the_title, problem_des):
            write_in_file(folder, label + ":\n" + str(value) + "\n")
        print("done")
    except Exception as exc:
        # Best-effort crawl: report why this problem was skipped and move on.
        # Catching Exception (not a bare except) keeps Ctrl-C able to stop
        # the run.
        print("跳过", exc)

2.hduacm

import requests
from bs4 import BeautifulSoup
import time,os

def write_in_file(f, string, base_dir='/Users/cyh/Desktop/acm/hduacm/'):
    """Append ``string`` to ``<base_dir>/<f>/<f>.txt`` (no newline is added).

    Args:
        f: Name used for both the per-problem folder and the text file in it.
        string: Text to append exactly as given.
        base_dir: Root folder that holds the per-problem folders. Defaults to
            the path this script was written for; pass another root (for
            example on Windows) instead of editing the code.
    """
    folder = os.path.join(base_dir, f)
    # Create the folder on demand so the function also works on a fresh root.
    os.makedirs(folder, exist_ok=True)
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(folder, f + ".txt"), "a+", encoding='utf-8') as fi:
        fi.write(string)
		
		
# Problem page URL; the numeric problem id is appended.
link = "http://acm.hdu.edu.cn/showproblem.php?pid="
headers = {
    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
}
# Root folder that receives one sub-folder per problem.
output_root = '/Users/cyh/Desktop/acm/hduacm/'
for i in range(6937, 6939):
    try:
        print("开始", i)
        r = requests.get(link + str(i), headers=headers, timeout=100)
        print("OK")
        soup = BeautifulSoup(r.text, "lxml")
        # The first <h1> is taken as the title; "/" would be read as a path
        # separator in the folder name, so it is replaced.
        problem_title = soup.find("h1").text.replace("/", "比")
        folder = str(i) + problem_title
        # exist_ok makes a re-run reuse the folder instead of failing.
        os.makedirs(output_root + folder, exist_ok=True)
        write_in_file(folder, "question: " + problem_title + "\n")
        # Presumably each "panel_title" div heads the "panel_content" div at
        # the same position - verify against the page markup.
        problem_des = soup.find_all("div", class_="panel_content")
        the_title = soup.find_all("div", class_="panel_title")
        print("写入" + str(i) + " file")
        # zip stops at the shorter list, so a page with unequal counts no
        # longer raises IndexError part-way through the file.
        for heading, body in zip(the_title, problem_des):
            write_in_file(folder, heading.text + ": " + body.text + "\n")
        print("done")
    except Exception as exc:
        # Best-effort crawl: report why this problem was skipped and move on,
        # so one bad page does not abort the remaining ids. Catching
        # Exception (not a bare except) keeps Ctrl-C able to stop the run.
        print("跳过", exc)

3.pkuacm

import requests
from bs4 import BeautifulSoup
import time,os,re
from lxml import etree


def write_in_file(f, string, base_dir='/Users/cyh/Desktop/acm/pkuacm/'):
    """Append ``string`` to ``<base_dir>/<f>/<f>.html`` (no newline is added).

    Args:
        f: Name used for both the per-problem folder and the HTML file in it.
        string: Text to append exactly as given.
        base_dir: Root folder that holds the per-problem folders. Defaults to
            the path this script was written for; pass another root (for
            example on Windows) instead of editing the code.
    """
    folder = os.path.join(base_dir, f)
    # Create the folder on demand so the function also works on a fresh root.
    os.makedirs(folder, exist_ok=True)
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(folder, f + ".html"), "a+", encoding='utf-8') as fi:
        fi.write(string)
		
		
# Problem page URL; the numeric problem id is appended.
link = "http://poj.org/problem?id="
headers = {
    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
}
# count[0] = problems saved, count[1] = problems skipped after an error.
count = [0, 0]
# Root folder that receives one sub-folder per problem.
output_root = '/Users/cyh/Desktop/acm/pkuacm/'
# Query suffix added to every URL; loop-invariant, so it is built once.
# Presumably it selects the Chinese version of the page - TODO confirm.
query_suffix = '&lang=zh-CN&change=true'
for i in range(2577, 3000):
    try:
        print("开始", i)
        r = requests.get(link + str(i) + query_suffix, headers=headers, timeout=100)
        print("OK")
        tree = etree.HTML(r.content, parser=etree.HTMLParser())
        # The second <table> under <body> is saved as the page content.
        page_table = tree.xpath("/html/body/table[2]")
        title_div = tree.xpath('/html/body/table[2]/tr/td/div[2]')
        # The title is recovered by serialising the div and cutting away its
        # opening tag and the closing tag plus the "&#13;" that follows it.
        problem_title = etree.tostring(title_div[0], encoding='utf-8').decode('utf-8')
        problem_title = problem_title.replace("</div>&#13;", '')
        problem_title = problem_title.replace('<div class="ptt" lang="zh-CN">', '')
        print(problem_title)
        content = etree.tostring(page_table[0], encoding='utf-8').decode('utf-8')
        # "/" would be read as a path separator in the folder name.
        problem_title = problem_title.replace("/", "比")
        # Strip newlines once and use the same name for both the folder and
        # the file, so the directory that is checked is the one created.
        folder = str(i) + problem_title.strip('\n')
        # exist_ok makes a re-run reuse the folder instead of failing.
        os.makedirs(output_root + folder, exist_ok=True)
        write_in_file(folder, content)
        count[0] += 1
    except Exception as exc:
        # Best-effort crawl: count the failure, report its cause and move on.
        # Catching Exception (not a bare except) keeps Ctrl-C able to stop
        # the run.
        count[1] += 1
        print("pass:", count[1], exc)
print("完成", count[0])

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值