#-*-coding:utf8-*-
import requests
import re
import sys
# Re-import sys and then switch the interpreter-wide default string encoding
# to UTF-8, so implicit str/unicode conversions in this script use UTF-8.
# NOTE(review): the bare reload() builtin and sys.setdefaultencoding() are
# Python 2 facilities -- confirm this script is only ever run on Python 2.
reload(sys)
sys.setdefaultencoding("utf-8")
class spider(object):
def __init__(self):
    """Announce on standard output that crawling is about to start."""
    banner = u'开始爬取内容。。。'
    # A single parenthesised value prints exactly like the bare statement
    # form under Python 2.
    print(banner)
#getsource用来获取网页源代码
def getsource(self, url, timeout=None):
    """Download a page with an HTTP GET and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Optional number of seconds handed to ``requests.get``.
            The default ``None`` keeps the previous behaviour of waiting
            with no limit; pass a number to stop a stalled server from
            blocking the crawl.

    Returns:
        The ``text`` attribute of the response object.
    """
    response = requests.get(url, timeout=timeout)
    return response.text
#changepage用来生产不同页数的链接
def changepage(self, url, total_page):
    """Build the list of page links from the current page up to total_page.

    The current page number is read from the ``pageNum=<digits>`` part of
    ``url``; one link is produced for every page number from that value
    through ``total_page`` inclusive, each being ``url`` with its
    ``pageNum`` value replaced.

    Args:
        url: A link containing ``pageNum=<digits>``.
        total_page: Number of the last page to generate a link for.

    Returns:
        A list of links in ascending page order. The list is empty when
        ``total_page`` is smaller than the page number found in ``url``.

    Raises:
        AttributeError: If ``url`` contains no ``pageNum=<digits>`` part
            (``re.search`` returns ``None``).
    """
    now_page = int(re.search(r'pageNum=(\d+)', url).group(1))
    # The fourth positional argument of re.sub is ``count``, not ``flags``;
    # the old call passed re.S there and so capped replacements at 16.
    # Neither pattern contains ".", so the DOTALL flag is not needed at all.
    return [
        re.sub(r'pageNum=\d+', 'pageNum=%s' % page, url)
        for page in range(now_page, total_page + 1)
    ]
#geteveryclass用来抓取每个课程块的信息
def geteveryclass(self,source):
# Runs re.findall over `source` with the re.S flag and returns the resulting
# list (bound to `everyclass`).
# NOTE(review): the pattern literal below is split across two lines and the
# `return` is fused onto the same line as the findall call, so this does not
# parse as written. The capture group's contents appear to be missing --
# presumably markup that was stripped when the file was extracted. Restore
# the pattern from the original file; TODO confirm.
everyclass = re.findall('(
)',source,re.S)return everyclass
#getinfo用来