最近在学习C++,爬下一些题来,用于平时练手。
原理其实也很简单,就是生成页面链接,下载网页,提取内容,保存为xml。
用到正则表达式来匹配,minidom来生成xml文件。
(相关URL已更改,题库将上传到sae的网页)
#coding:utf8
# c++ exercises crawler by bosshida,2014.1.2
import urllib2
import re
import string
from xml.dom import minidom
# Problem title: captures the contents of the first <h2> element.
# DOTALL lets "." cross line breaks, so multi-line elements still match.
pNamePattern = re.compile("<h2.*?>(.*?)</h2>", re.DOTALL)
# Problem description: captures the contents of the <td> whose background
# attribute is "srcs/bg_mid.gif".
pDescPattern = re.compile('<td.*?background="srcs/bg_mid.gif".*?>(.*?)</td>', re.DOTALL)
# Any <pre> element. The getters below index the findall() result:
# 0 = input, 1 = output, 2 = sample input, 3 = sample output.
pPrePattern = re.compile("<pre.*?>(.*?)</pre>", re.DOTALL)
def genProblemSetUrl(count):
    """Build the URL of one problem-set listing page.

    count -- volume number, substituted into the ``vol`` query parameter
             with ``%d`` formatting.

    Returns the complete URL as a string.
    """
    return "http://xxx.edu.cn:8080/JudgeOnline/problemset.jsp?vol=%d" % count
def getAllProblemUrl(page):
    """Collect the relative problem links contained in a listing page.

    page -- HTML text of a problem-set page.

    Returns a list of strings of the form ``problem.jsp?id=<digits>`` in
    document order; the list is empty when the page has no such link.
    """
    # Raw string: "\?" and "\d" must reach the regex engine untouched (in a
    # normal string literal they are invalid escape sequences). The dot is
    # escaped so that only a literal "problem.jsp" is accepted.
    return re.findall(r'<a href="(problem\.jsp\?id=\d+)">', page)
def loadPage(url):
    """Download *url* and return the raw response body.

    url -- address handed to ``urllib2.urlopen``.

    Returns the undecoded body exactly as read from the response.
    Errors raised by ``urllib2.urlopen`` or ``read`` propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()
def getProblemName(page):
    """Extract the problem name from a problem page.

    page -- text of the problem page.

    Returns the first capture group of ``pNamePattern``, or None when the
    pattern does not match anywhere in *page*.
    """
    found = pNamePattern.search(page)
    if found is None:
        return None
    return found.group(1)
def getProblemDesc(page):
    """Extract the problem description from a problem page.

    page -- text of the problem page.

    Returns the first capture group of ``pDescPattern``, or None when the
    pattern does not match anywhere in *page*.
    """
    found = pDescPattern.search(page)
    if found is None:
        return None
    return found.group(1)
def getInputContent(page):
    """Return the input section: the contents of the 1st <pre> block.

    page -- text of the problem page.

    Returns None when the page has no <pre> block, instead of raising
    IndexError. This matches getProblemName / getProblemDesc, which also
    return None when nothing is found.
    """
    preBlocks = pPrePattern.findall(page)
    return preBlocks[0] if len(preBlocks) > 0 else None
def getOutputContent(page):
    """Return the output section: the contents of the 2nd <pre> block.

    page -- text of the problem page.

    Returns None when the page has fewer than two <pre> blocks, instead of
    raising IndexError. This matches getProblemName / getProblemDesc, which
    also return None when nothing is found.
    """
    preBlocks = pPrePattern.findall(page)
    return preBlocks[1] if len(preBlocks) > 1 else None
def getSampleInput(page):
    """Return the sample input: the contents of the 3rd <pre> block.

    page -- text of the problem page.

    Returns None when the page has fewer than three <pre> blocks, instead
    of raising IndexError. This matches getProblemName / getProblemDesc,
    which also return None when nothing is found.
    """
    preBlocks = pPrePattern.findall(page)
    return preBlocks[2] if len(preBlocks) > 2 else None
def getSampleOutput(page):
    """Return the sample output: the contents of the 4th <pre> block.

    page -- text of the problem page.

    Returns None when the page has fewer than four <pre> blocks, instead
    of raising IndexError. This matches getProblemName / getProblemDesc,
    which also return None when nothing is found.
    """
    preBlocks = pPrePattern.findall(page)
    return preBlocks[3] if len(preBlocks) > 3 else None
class XmlGenerator:
    """Small helper that builds a minidom document and writes it to a file.

    Attributes:
        doc     -- the minidom Document being assembled.
        xmlName -- path of the file that genXml() writes.
    """

    def __init__(self, xmlName):
        """Create an empty document that will be saved to *xmlName*."""
        self.doc = minidom.Document()
        self.xmlName = xmlName

    def createNode(self, nodeName):
        """Return a new, unattached element called *nodeName*."""
        return self.doc.createElement(nodeName)

    def addNode(self, node, prevNode=None):
        """Attach *node* under *prevNode*, or under the document itself.

        When *prevNode* is None the node becomes a child of the document
        (i.e. the root element). Returns *node* so calls can be chained.
        """
        if prevNode is not None:
            prevNode.appendChild(node)
        else:
            self.doc.appendChild(node)
        return node

    def setNodeAttr(self, node, attName, value):
        """Set attribute *attName* of *node* to *value*."""
        # The original referenced a misspelled local name here and raised
        # NameError on every call.
        node.setAttribute(attName, value)

    def setNodeValue(self, curNode, value):
        """Append *value* to *curNode* as a text node.

        A None *value* is ignored and the element is left without text,
        because createTextNode() rejects non-string data with TypeError.
        """
        if value is None:
            return
        curNode.appendChild(self.doc.createTextNode(value))

    def genXml(self):
        """Serialise the document to ``self.xmlName`` as pretty-printed XML."""
        # Serialise first, so a failure here cannot leave a truncated file.
        data = self.doc.toprettyxml(indent="\t", newl="\n", encoding="utf8")
        # With an explicit encoding toprettyxml() returns encoded bytes, so
        # the file is opened in binary mode; "with" closes it on error too.
        with open(self.xmlName, "wb") as f:
            f.write(data)
# ---- crawl driver --------------------------------------------------------
# Walks the problem-set volumes, downloads every linked problem page, pulls
# the fields out with the getters above and stores them as
# <root><problem>...</problem>...</root> in the file named by `path`.
path = "D://test.xml"
xmlGen = XmlGenerator(path)
rootNode = xmlGen.createNode("root")
xmlGen.addNode(node=rootNode)

# Volumes are requested as vol=1 .. vol=6.
for count in range(6):
    pSetUrl = genProblemSetUrl(count + 1)
    # Single-argument print(...) prints the same text under Python 2.
    print(pSetUrl)
    html = loadPage(pSetUrl)
    for pageUrl in getAllProblemUrl(html):
        pageUrl = "http://xxx.edu.cn:8080/JudgeOnline/" + pageUrl
        print(pageUrl)
        pPage = loadPage(pageUrl)
        # Pages are decoded as GBK before the regexes run.
        pPage_gbk = unicode(pPage, "gbk")

        # (element name, text) pairs, in the order they appear in the XML.
        fields = (
            ("url", pageUrl),
            ("name", getProblemName(pPage_gbk)),
            ("desc", getProblemDesc(pPage_gbk)),
            ("input", getInputContent(pPage_gbk)),
            ("output", getOutputContent(pPage_gbk)),
            ("sampleInput", getSampleInput(pPage_gbk)),
            ("sampleOutput", getSampleOutput(pPage_gbk)),
        )

        problemNode = xmlGen.createNode("problem")
        for tagName, text in fields:
            fieldNode = xmlGen.createNode(tagName)
            # A getter may return None when its pattern is absent from the
            # page; createTextNode() rejects None, so store empty text.
            xmlGen.setNodeValue(fieldNode, u"" if text is None else text)
            xmlGen.addNode(fieldNode, problemNode)
        xmlGen.addNode(problemNode, rootNode)

    # NOTE(review): the source lost its indentation, so the nesting of this
    # call is assumed - the file is rewritten once per volume, which keeps
    # partial results if a later volume fails. Confirm against the original.
    xmlGen.genXml()

print("finish")