# -*- coding:utf-8 -*-
#python版本: 2.7.6
import urllib2
import re
import os
import time
import datetime
def get_content(url, my_headers, pageB, pageE):
"""
@my_headers: 使用伪装浏览器的方法避免403禁止访问
@pageB: 获取排名开始页
@pageE: 获取排名结束页
@url: 要获取的网页伪链接(这点不好)
@获取网页信息
"""
i = pageB
content1 = ""
while i <= pageE:
print "Dealing with the %dth pages" % i
real_url = url + ("%d" % i)
req = urllib2.Request(real_url, headers = my_headers)
content1 += urllib2.urlopen(req).read()
i += 1
content1.decode("utf-8")
pattern = r'(([ ]|[\n]|[\r])+?)'
content = re.sub(pattern, '', content1, count = 0)
preg = r'ahref="profile\.php\?userid=(.+?)">(.+?)</a></td><td>(\d+?)</td><td>(\d+?)</td><tdclass="tar">(\d+?)</td><tdclass="tar">(.+?)</td>'
pat = re.compile(preg) #add this line, the code will running quickly
msg_code = re.findall(pat, content)
return msg_code
def save_msg(content):
    """
    Append *content* to the local ranking.txt file.

    @content: text to append (caller supplies any trailing newline)
    """
    # "with" guarantees the handle is closed even if write() raises,
    # unlike the original open()/write()/close() sequence.
    with open('ranking.txt', 'a+') as f:
        f.write(content)
def get_ranking(url, my_headers, pageBegin, pageEnd):
    """
    Fetch the ranking in small batches and append matching rows to ranking.txt.

    @url: base URL to which the page number is appended (fragile design)
    @my_headers: browser-like headers used to avoid a 403 response
    @pageBegin: first page to process (inclusive)
    @pageEnd: last page to process (inclusive)

    Pages are fetched in batches of `oper_num` so a single request batch never
    covers too many pages at once; each batch is written out immediately.
    """
    # Team-member names to keep; a row survives if its name appears here.
    names = "Leida_龄子Leida_邱仁团Leida_吴小雪Leida_曹晨霞Leida_孙志旺Leida_陈志振Leida_吴文洁Leida_周娟娟Leida_樊泽亮Leida_褚后屹leida"
    # Running rank ordinal for matched rows.
    num = 1
    # Batch size in pages.
    oper_num = 5
    # Timestamp this run in the output file.
    save_msg(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+"\n")
    # BUG FIX: the original used "<", which silently did nothing when
    # pageBegin == pageEnd (a single-page request). "<=" fixes that case
    # and leaves every multi-page trace identical.
    while pageBegin <= pageEnd :
        content = ""
        if pageEnd - pageBegin >= oper_num :
            end = pageBegin + oper_num - 1
        else :
            end = pageEnd
        lists = get_content(url, my_headers, pageBegin, end)
        for li in lists:
            if names.find(li[1]) != -1:
                # "%5d" rank number, then the five captured columns,
                # each preceded by a single space.
                content += ("%5d" % num)
                for field in li[1:6]:
                    content += " " + field
                content += '\n'
                num += 1
        pageBegin = end + 1
        # Flush this batch to disk before fetching the next one.
        save_msg(content)
if __name__ == "__main__":
    # Entry point: scrape pages 1-20 of the NYIST OJ ranking.
    # Results are appended to the local ranking.txt file.
    # (The site can be very slow when under load.)
    base_url = "http://acm.nyist.net/JudgeOnline/rank.php?page="
    request_headers = {
        'Host': "acm.nyist.net",
        "Referer": "http://acm.nyist.net/JudgeOnline/problemset.php",
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
    }
    first_page, last_page = 1, 20
    get_ranking(base_url, request_headers, first_page, last_page)
# Trailing article text pasted from the hosting site (not code) — kept as a comment
# so the file remains valid Python:
# python抓取南阳理工学院ACM网站排名信息
# 最新推荐文章于 2018-10-06 09:49:18 发布