# Scrape ranking information from the NYIST ACM online-judge site

# -*- coding:utf-8 -*-
#python版本: 2.7.6

import urllib2
import re
import os
import time
import datetime

def get_content(url, my_headers, pageB, pageE):
  """
  @my_headers: 使用伪装浏览器的方法避免403禁止访问
  @pageB: 获取排名开始页
  @pageE: 获取排名结束页
  @url: 要获取的网页伪链接(这点不好)
  @获取网页信息
  """
  i = pageB
  content1 = ""
  while i <= pageE:
    print "Dealing with the %dth pages" % i
    real_url = url + ("%d" % i)
    req = urllib2.Request(real_url, headers = my_headers)
    content1 += urllib2.urlopen(req).read()
    i += 1
  content1.decode("utf-8")
  pattern = r'(([ ]|[\n]|[\r])+?)'
  content = re.sub(pattern, '', content1, count = 0)

  preg = r'ahref="profile\.php\?userid=(.+?)">(.+?)</a></td><td>(\d+?)</td><td>(\d+?)</td><tdclass="tar">(\d+?)</td><tdclass="tar">(.+?)</td>'
  pat = re.compile(preg) #add this line, the code will running quickly
  msg_code = re.findall(pat, content)

  return msg_code

def save_msg(content):
  """
  Append text to the local ranking.txt file.

  @content: text to append (file is created on first use)
  """
  # 'with' guarantees the handle is closed even if write() raises;
  # the original open()/close() pair leaked the handle on error.
  with open('ranking.txt', 'a+') as f:
    f.write(content)

def get_ranking(url, my_headers, pageBegin, pageEnd):
  """
  Fetch the ranking pages in small batches (so no single request burst
  handles too much data), keep only tracked team members, and append
  the result to ranking.txt via save_msg().

  @url: base URL; get_content() appends the page number
  @my_headers: request headers; browser spoof avoids a 403
  @pageBegin: first ranking page (inclusive)
  @pageEnd: last ranking page (inclusive)
  """
  # Team members to keep.
  # NOTE(review): membership is tested with substring find(), so any
  # captured name that happens to be a substring of this string would
  # also match — acceptable here, but worth confirming.
  names = "Leida_龄子Leida_邱仁团Leida_吴小雪Leida_曹晨霞Leida_孙志旺Leida_陈志振Leida_吴文洁Leida_周娟娟Leida_樊泽亮Leida_褚后屹leida"
  num = 1        # running rank number among matched members
  oper_num = 5   # pages fetched per batch
  # Timestamp this run so successive runs are separated in the file.
  save_msg(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "\n")
  # '<=' fixes an off-by-one in the original ('<'), which silently
  # skipped the only page when pageBegin == pageEnd. Multi-page ranges
  # batch exactly as before.
  while pageBegin <= pageEnd:
    content = ""
    # Batch end: at most oper_num pages, clipped to pageEnd.
    if pageEnd - pageBegin >= oper_num:
      end = pageBegin + oper_num - 1
    else:
      end = pageEnd
    lists = get_content(url, my_headers, pageBegin, end)
    for li in lists:
      # li = (userid, name, col3, col4, col5, col6) from get_content()
      if names.find(li[1]) != -1:
        content += ("%5d" % num)        # rank padded to width 5
        for field in li[1:6]:           # name + the four table columns
          content += "   " + field
        content += '\n'
        num += 1
    pageBegin = end + 1
    # Flush this batch to disk before fetching the next one.
    save_msg(content)

if __name__ == "__main__":
  # Entry point: scrape pages 1..20 of the NYIST ACM ranking.
  # The site can be slow when under load; results are appended to the
  # local ranking.txt file.
  base_url = "http://acm.nyist.net/JudgeOnline/rank.php?page="
  request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
    "Referer": "http://acm.nyist.net/JudgeOnline/problemset.php",
    'Host': "acm.nyist.net",
  }
  first_page, last_page = 1, 20
  get_ranking(base_url, request_headers, first_page, last_page)
# (CSDN page boilerplate removed — comment/paywall widget text, not part of the script)