Python学习—网络爬虫之OJ

最新推荐文章于 2022-10-09 19:22:54 发布

mowayao

最新推荐文章于 2022-10-09 19:22:54 发布

阅读量861

点赞数

文章标签： python

本文链接：https://blog.csdn.net/mowayao/article/details/11953691

版权

最近在学Python的正则表达式，简直就是大神器，爬虫也蛮有意思的。

'''
Created on 2013-9-19
for poj
@author: yzp
'''
import urllib
import re
import os
import sys
def SubTitle(web):
    """Return every ``<title>...</title>`` fragment found in *web*.

    The result is the list produced by ``findall``: whole matches with the
    tags included, or an empty list when nothing matches.  ``.`` does not
    cross newlines here, so a title split over several lines is not found.
    """
    title_pattern = re.compile("<title>.+</title>")
    return title_pattern.findall(web)
def get_problem(i):
    """Download POJ problem *i* and save its statement as plain text.

    The page ``http://poj.org/problem?id=<i>`` is fetched and the span from
    the first ``Description`` to the last ``Source`` is extracted.  HTML tags
    are stripped, section headings are put on their own lines, and the text
    (prefixed with the page title when one is found) is written to a file
    named ``problem<i>`` in the current directory.  The file is created even
    when the page contains no such span; it is then left empty.

    NOTE: this is Python 2 code -- ``urllib.urlopen`` is the Python 2 API.

    :param i: problem id; ``repr(i)`` is appended to the file name and URL.
    """
    filename = "problem" + repr(i)
    url = "http://poj.org/problem?id=" + repr(i)

    # ``with`` guarantees the output file is flushed and closed on every
    # path, including when the download or the parsing below raises.
    with open(filename, 'w') as fl:
        f = urllib.urlopen(url)
        try:
            html = f.read()
        finally:
            # Release the HTTP connection as soon as the body is in memory.
            f.close()

        title = SubTitle(html)
        # DOTALL lets ``.*`` run across line breaks; the match is greedy, so
        # it ends at the last ``Source`` on the page.
        problem = re.compile(u'Description.*Source', re.DOTALL)
        style = problem.search(html)
        if not style:
            return

        # A page without a <title> match contributes no prefix instead of
        # raising IndexError on ``title[0]``.
        heading = title[0] if title else ''
        html = heading + style.group(0)

        tmp = re.sub('<[^>]*>', '', html)  # drop every HTML tag
        tmp = tmp.replace('Description', 'Description\n')
        tmp = tmp.replace('Sample Input', '\nSample Input\n')
        tmp = tmp.replace('Sample Output', '\nSample Output\n')
        tmp = tmp.replace('Source', '')
        tmp = re.sub(';', '', tmp)  # remove all semicolons from the text
        fl.write(tmp)
def main():
    """Call ``get_problem`` for each id from 1001 up to and including 4053."""
    problem_id = 1001
    while problem_id < 4054:
        get_problem(problem_id)
        problem_id += 1
main()

'''
Created on 2013-9-19
for hdoj
@author: yzp
'''
import urllib
#print(urllib.urlopen('http://news.qq.com/a/20130919/000593.htm').read())
import re
import os
import sys

def get_problem(i):
    """Download HDOJ problem *i* and save its statement as plain text.

    The page ``http://acm.hdu.edu.cn/showproblem.php?pid=<i>`` is fetched and
    the span from the first ``Problem Description`` to the last ``Author`` is
    extracted.  HTML tags are stripped, section headings are put on their own
    lines, and the text is written to a file named ``problem<i>`` in the
    current directory.  The file is created even when the page contains no
    such span; it is then left empty.

    NOTE: this is Python 2 code -- ``urllib.urlopen`` is the Python 2 API.

    :param i: problem id; ``repr(i)`` is appended to the file name and URL.
    """
    filename = "problem" + repr(i)
    url = "http://acm.hdu.edu.cn/showproblem.php?pid=" + repr(i)

    # ``with`` guarantees the output file is flushed and closed on every
    # path, including when the download or the parsing below raises.
    with open(filename, 'w') as fl:
        f = urllib.urlopen(url)
        try:
            html = f.read()
        finally:
            # Release the HTTP connection as soon as the body is in memory.
            f.close()

        # DOTALL lets ``.*`` run across line breaks; the match is greedy, so
        # it ends at the last ``Author`` on the page.
        problem = re.compile(u'Problem Description.*Author', re.DOTALL)
        style = problem.search(html)
        if not style:
            return

        tmp = re.sub('<[^>]*>', '', style.group(0))  # drop every HTML tag
        # NOTE(review): the published listing shows a bare space as the
        # search string here.  Removing every space would prevent the
        # 'Problem Description' / 'Sample Input' / 'Sample Output'
        # replacements below from ever matching, so this is taken to be an
        # HTML entity that was rendered away when the code was posted --
        # confirm against the original script.
        tmp = tmp.replace('&nbsp;', '')
        tmp = tmp.replace('Author', '')
        tmp = tmp.replace('Problem Description', 'Problem Description\n')
        tmp = tmp.replace('Sample Input', '\nSample Input\n')
        tmp = tmp.replace('Sample Output', '\nSample Output\n')
        tmp = re.sub(';', '', tmp)  # remove all semicolons from the text
        fl.write(tmp)
    
def main():
    """Call ``get_problem`` for each id from 1000 up to and including 4757."""
    first_id, stop_id = 1000, 4758
    for pid in range(first_id, stop_id):
        get_problem(pid)
main()

mowayao

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python学习—网络爬虫之OJ

最近在学Python的正则表达式，简直就是大神器，爬虫也蛮有意思的。 ''' Created on 2013-9-19 for poj @author: yzp ''' import urllib #print(urllib.urlopen('http://news.qq.com/a/20130919/000593.htm').read()) import re import os impo…
复制链接

扫一扫