最近在学Python的正则表达式,简直就是大神器,爬虫也蛮有意思的。
'''
Created on 2013-9-19
for poj
@author: yzp
'''
import urllib
import re
import os
import sys
def SubTitle(web):
    """Return every ``<title>...</title>`` element found in *web*.

    Each entry of the returned list is the complete matched text, with the
    opening and closing tags included.  The pattern is greedy and ``.`` does
    not cross line breaks, so a match never spans more than one line.  The
    list is empty when nothing matches.
    """
    title_pattern = re.compile("<title>.+</title>")
    return title_pattern.findall(web)
def get_problem(i):
    """Download POJ problem *i* and save its statement as plain text.

    The page ``http://poj.org/problem?id=<i>`` is fetched.  The text from the
    first ``Description`` to the last ``Source`` is kept and prefixed with the
    page's first ``<title>`` element (when there is one).  HTML tags are then
    stripped, a few section headings are put on their own lines, and the
    result is written to a file named ``problem<i>`` in the current working
    directory.

    When the ``Description ... Source`` span is not found, the whole page is
    tag-stripped and saved instead.

    Args:
        i: Problem id; it is converted with ``str`` to build the URL and the
            output file name.
    """
    url = "http://poj.org/problem?id=" + str(i)

    # Download before touching the disk, so a failed request does not leave
    # an empty output file behind.
    # urllib.urlopen is the Python 2 API this script is written against.
    f = urllib.urlopen(url)
    try:
        html = f.read()
    finally:
        f.close()

    problem = re.compile(u'Description.*Source', re.DOTALL)
    style = problem.search(html)
    if style:
        title = SubTitle(html)
        # A page without a <title> must not abort the run with IndexError.
        heading = title[0] if title else ''
        html = heading + style.group(0)

    # Drop every HTML tag, then put the section headings on their own lines.
    tmp = re.sub('<[^>]*>', '', html)
    tmp = tmp.replace('Description', 'Description\n')
    tmp = tmp.replace('Sample Input', '\nSample Input\n')
    tmp = tmp.replace('Sample Output', '\nSample Output\n')
    tmp = tmp.replace('Source', '')
    tmp = re.sub(';', '', tmp)

    filename = "problem" + str(i)
    # The context manager guarantees the file is flushed and closed.
    with open(filename, 'w') as fl:
        fl.write(tmp)
def main():
    """Fetch every POJ problem with an id from 1001 through 4053."""
    first_id, stop_id = 1001, 4054  # stop_id is exclusive
    for problem_id in range(first_id, stop_id):
        get_problem(problem_id)
# Start the POJ crawl. There is no ``__name__`` guard, so this call runs
# whenever the file is executed or imported.
main()
'''
Created on 2013-9-19
for hdoj
@author: yzp
'''
import urllib
#print(urllib.urlopen('http://news.qq.com/a/20130919/000593.htm').read())
import re
import os
import sys
def get_problem(i):
    """Download HDOJ problem *i* and save its statement as plain text.

    The page ``http://acm.hdu.edu.cn/showproblem.php?pid=<i>`` is fetched.
    The text from the first ``Problem Description`` to the last ``Author`` is
    kept, HTML tags are stripped, a few section headings are put on their own
    lines, and the result is written to a file named ``problem<i>`` in the
    current working directory.

    When the ``Problem Description ... Author`` span is not found, the whole
    page is tag-stripped and saved instead.

    Args:
        i: Problem id; it is converted with ``str`` to build the URL and the
            output file name.
    """
    url = "http://acm.hdu.edu.cn/showproblem.php?pid=" + str(i)

    # Download before touching the disk, so a failed request does not leave
    # an empty output file behind.
    # urllib.urlopen is the Python 2 API this script is written against.
    f = urllib.urlopen(url)
    try:
        html = f.read()
    finally:
        f.close()

    problem = re.compile(u'Problem Description.*Author', re.DOTALL)
    style = problem.search(html)
    if style:
        html = style.group(0)

    # Drop every HTML tag before tidying up the remaining text.
    tmp = re.sub('<[^>]*>', '', html)
    # NOTE(review): as written this removes every space character from the
    # text; it may have been meant to strip "&nbsp;" entities -- confirm
    # against the original script before changing it.
    tmp = tmp.replace(' ', '')
    tmp = tmp.replace('Author', '')
    tmp = tmp.replace('Problem Description', 'Problem Description\n')
    tmp = tmp.replace('Sample Input', '\nSample Input\n')
    tmp = tmp.replace('Sample Output', '\nSample Output\n')
    tmp = re.sub(';', '', tmp)

    filename = "problem" + str(i)
    # The context manager guarantees the file is flushed and closed.
    with open(filename, 'w') as fl:
        fl.write(tmp)
def main():
    """Fetch every HDOJ problem with an id from 1000 through 4757."""
    first_id, stop_id = 1000, 4758  # stop_id is exclusive
    for problem_id in range(first_id, stop_id):
        get_problem(problem_id)
# Start the HDOJ crawl. There is no ``__name__`` guard, so this call runs
# whenever the file is executed or imported.
main()