爬取三个acm网站题库（neuqoj pku hdu）

最新推荐文章于 2021-11-11 17:26:04 发布

爱学编程的cyh

最新推荐文章于 2021-11-11 17:26:04 发布

阅读量625

点赞数

分类专栏： Python 文章标签： python xpath acm竞赛算法爬虫

代码由爱学习的cyh编写

本文链接：https://blog.csdn.net/m0_46533133/article/details/118387476

版权

Python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

环境：macOS + Python 3.9（Windows 下仅需把代码中的保存目录改成本机路径）

效果图：

代码：

没有写多线程；如需抓取更多题目，可按需更改 range() 的题号范围，或者把题号区间拆分到多个脚本文件中同时运行。

1.neuqoj

import requests
from bs4 import BeautifulSoup
import time,os,re
import json
def write_in_file(f,string,base_dir='/Users/cyh/Desktop/acm/neuqacm/'):#output function
	"""Append ``string`` followed by a newline to one problem's text file.

	The file written is ``<base_dir>/<f>/<f>.txt``. The folder
	``<base_dir>/<f>`` must already exist, because ``open`` does not create
	missing directories.

	Args:
		f: Name of the problem folder; also used as the file's base name.
		string: Text to append. A single newline is written after it.
		base_dir: Root folder of the archive. Defaults to the location that
			was previously hard-coded, so existing two-argument calls
			behave as before.

	Raises:
		OSError: If the problem folder is missing or the file cannot be
			opened for appending.
	"""
	target = os.path.join(base_dir, f, f+".txt")
	# The with-statement closes the file on exit, so no explicit close() call.
	with open(target,"a",encoding='utf-8') as fi:
		fi.write(string)
		fi.write("\n")
		
		
# Problem endpoint; the problem id is appended and the response is read as JSON.
link = "http://140.143.222.61:8088/problem/"
# Second address kept from the original script; the loop below does not use it.
link2="http://newoj.acmclub.cn/problems/"
headers = {
	'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1' ,
	'accept-language': 'zh-CN,zh'
}
# Folder that receives one sub-folder per problem.
save_root = '/Users/cyh/Desktop/acm/neuqacm/'
# (fragment, replacement) pairs stripped from the description.
# They are applied strictly in this order: later entries see the result of
# earlier ones, so reordering would change the output.
description_cleanup = (
	('<div align="left"><span style="font-size: medium">', ' '),
	('<font color="#000000">', ''),
	('<span style="font-size: medium">', ''),
	('<span style="font-size: small">', ''),
	('<span>', ' '),
	('</span>', ' '),
	('<p><style type="text/css">p { margin-bottom: 0.21cm; }</style>\n\t\t\t', ' '),
	('<p style="margin-bottom: 0cm;"><font color="#000000">', ' '),
	("<p>", ' '),
	("</p>", ' '),
	("<font>", ' '),
	("</font>", ' '),
	("<br />", ' '),
	('<style type="text/css">p { margin-bottom: 0.21cm; }</style>', ' '),
	('&nbsp;', ''),
)
# (heading written to the file, key under the response's 'data' object).
section_keys = (
	('难度', 'difficulty'),
	('输入描述', 'input'),
	('输出描述', 'output'),
	('样例输入', 'sample_input'),
	('样例输出', 'sample_output'),
)
for i in range (1002,1003):
	try:
		print("开始",i)
		r = requests.get(link+str(i),headers = headers,timeout = 100)
		data = r.json()['data']
		# "/" would be read as a path separator in the folder name.
		problem_title = data['title'].replace("/", "比")
		folder_name = str(i)+problem_title
		# exist_ok makes re-runs safe and also creates a missing save_root.
		os.makedirs(os.path.join(save_root, folder_name), exist_ok=True)
		write_in_file(folder_name,"question: "+problem_title+"\n")
		# Read every field before writing the sections, so a missing key
		# skips the problem without leaving a half-written section list.
		sections = [(heading, data[key]) for heading, key in section_keys]
		print("写入"+str(i) +" file")
		description = data['description']
		for fragment, replacement in description_cleanup:
			description = description.replace(fragment, replacement)
		print(description)
		write_in_file(folder_name,'题目描述'+":\n"+description+"\n")
		for heading, value in sections:
			write_in_file(folder_name,heading+":\n"+str(value)+"\n")
		print("done")
	except Exception as exc:
		# Best effort: report which problem failed and why, then continue.
		# Unlike a bare except, this lets Ctrl-C (KeyboardInterrupt) through.
		print("跳过",i,exc)

2.hduacm

import requests
from bs4 import BeautifulSoup
import time,os

def write_in_file(f,string,base_dir='/Users/cyh/Desktop/acm/hduacm/'):#output function
	"""Append ``string`` to one problem's text file, adding no newline.

	The file written is ``<base_dir>/<f>/<f>.txt``. The folder
	``<base_dir>/<f>`` must already exist, because ``open`` does not create
	missing directories.

	Args:
		f: Name of the problem folder; also used as the file's base name.
		string: Text to append exactly as given.
		base_dir: Root folder of the archive. Defaults to the location that
			was previously hard-coded, so existing two-argument calls
			behave as before.

	Raises:
		OSError: If the problem folder is missing or the file cannot be
			opened for appending.
	"""
	target = os.path.join(base_dir, f, f+".txt")
	# The with-statement closes the file on exit, so no explicit close() call.
	with open(target,"a",encoding='utf-8') as fi:
		fi.write(string)
		
		
# Problem page address; the problem id is appended as the pid value.
link = "http://acm.hdu.edu.cn/showproblem.php?pid="
headers = {
	'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'    
}
# Folder that receives one sub-folder per problem.
save_root = '/Users/cyh/Desktop/acm/hduacm/'
for i in range (6937,6939):
	print("开始",i)
	try:
		r = requests.get(link+str(i),headers = headers,timeout = 100)
		print("OK")
		soup = BeautifulSoup(r.text,"lxml")
		heading = soup.find("h1")#get the title
		if heading is None:
			# Without an <h1> there is no title to name the folder after.
			print("跳过",i,"no <h1> title found")
			continue
		# "/" would be read as a path separator in the folder name.
		problem_title = heading.text.replace("/", "比")
		folder_name = str(i)+problem_title
		# exist_ok makes re-runs safe and also creates a missing save_root.
		os.makedirs(os.path.join(save_root, folder_name), exist_ok=True)
		write_in_file(folder_name,"question: "+problem_title+"\n")
		problem_des = soup.find_all("div",class_="panel_content")
		the_title = soup.find_all("div",class_ ="panel_title")
		print("写入"+str(i) +" file")
		# zip pairs each section title with its content and stops at the
		# shorter list instead of raising IndexError on a mismatch.
		for title_div, content_div in zip(the_title, problem_des):
			write_in_file(folder_name,title_div.text+": "+content_div.text+"\n")
		print("done")
	except (requests.RequestException, OSError) as exc:
		# A network or file-system failure skips this problem only, so one
		# bad request no longer aborts the rest of the range.
		print("跳过",i,exc)

3.pkuacm

import requests
from bs4 import BeautifulSoup
import time,os,re
from lxml import etree


def write_in_file(f,string,base_dir='/Users/cyh/Desktop/acm/pkuacm/'):#output function
	"""Append ``string`` to one problem's HTML file, adding no newline.

	The file written is ``<base_dir>/<f>/<f>.html``. The folder
	``<base_dir>/<f>`` must already exist, because ``open`` does not create
	missing directories.

	Args:
		f: Name of the problem folder; also used as the file's base name.
		string: Markup to append exactly as given.
		base_dir: Root folder of the archive. Defaults to the location that
			was previously hard-coded, so existing two-argument calls
			behave as before.

	Raises:
		OSError: If the problem folder is missing or the file cannot be
			opened for appending.
	"""
	target = os.path.join(base_dir, f, f+".html")
	# The with-statement closes the file on exit, so no explicit close() call.
	with open(target,"a",encoding='utf-8') as fi:
		fi.write(string)
		
		
# Problem page address; the problem id is appended as the id value.
link = "http://poj.org/problem?id="
headers = {
	'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'    
}
# Folder that receives one sub-folder per problem.
save_root = '/Users/cyh/Desktop/acm/pkuacm/'
# Query-string suffix added to every request; constant, so built once.
t='&lang=zh-CN&change=true'
# count[0]: problems saved, count[1]: problems skipped after an error.
count=[0,0]
for i in range (2577,3000):
	try:
		print("开始",i)
		r = requests.get(link+str(i)+t,headers = headers,timeout = 100)
		page_bytes = r.content
		print("OK")
		c=etree.HTML(page_bytes,parser=etree.HTMLParser())
		# The second table of <body> is saved as the problem content; the
		# title is taken from the second <div> in that table's cell.
		d=c.xpath("/html/body/table[2]")
		e=c.xpath('/html/body/table[2]/tr/td/div[2]')
		problem_title=etree.tostring(e[0],encoding='utf-8').decode('utf-8').replace("</div>&#13;",'').replace('<div class="ptt" lang="zh-CN">', '')
		print(problem_title)
		content=etree.tostring(d[0],encoding='utf-8').decode('utf-8')
		# "/" would be read as a path separator in the folder name.
		problem_title=problem_title.replace("/", "比")
		# Strip newlines once and use the same name for both the folder and
		# the file. Previously the existence check used the unstripped name,
		# so a re-run tried to create an existing folder and failed.
		folder_name = str(i)+problem_title.strip('\n')
		# exist_ok makes re-runs safe and also creates a missing save_root.
		os.makedirs(os.path.join(save_root, folder_name), exist_ok=True)
		write_in_file(folder_name, content)
		count[0]+=1
	except Exception as exc:
		# Best effort: count the failure, report which problem failed and
		# why, then continue. Unlike a bare except, Ctrl-C still stops the run.
		count[1]+=1
		print("pass:",count[1],i,exc)
print("完成",count[0])

爱学编程的cyh

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
爬取三个acm网站题库（neuqoj pku hdu）

环境：macos+Python3.9（Windows版本仅需更改目录）效果图：代码：没有写多线程，按需更改range（）或者多个文件一起运行。1.neuqojimport requestsfrom bs4 import BeautifulSoupimport time,os,reimport jsondef write_in_file(f,string):#output function with open ('/Users/cyh/Desktop/acm/neu
复制链接

扫一扫

专栏目录