python爬取学校_python爬虫程序爬学校教务网的课程信息

#-*-coding:utf8-*-

import re

import urllib2

#open internet

def get_stock_html(url):
    """Download *url* and return the whole response body as one string.

    The request is sent through a urllib2 opener that follows HTTP
    redirects and identifies itself with an MSIE 7.0 ``User-agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The complete response body.

    Raises:
        urllib2.URLError: propagated from ``opener.open`` when the request
            fails (``urllib2.HTTPError`` is a subclass).
    """
    opener = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
    )
    opener.addheaders = [
        ('User-agent',
         "Mozilla/4.0 (compatible;MSIE 7.0;"
         "Windows NT 5.1; .NET CLR 2.0.50727;"
         ".NET CLR 3.0.4506.2152; .NET CLR 3.5.30729")
    ]

    response = opener.open(url)
    # urllib2 responses are not context managers in Python 2, so close the
    # connection explicitly even when reading raises.
    try:
        return response.read()
    finally:
        response.close()

#----------------------------------------------------------------------

def get_message(data):

# Extract course rows from the page text in `data` and write one
# tab-separated line per row to the module-level file object `okdata`.
#
# NOTE(review): this block appears to have been damaged when the article was
# published: indentation is missing throughout, and several regex literals
# look as if the HTML tags they contained were stripped out. The code is kept
# exactly as found; the notes below mark what has to be restored from the real
# page markup before it can run.
"""get zhe message"""

# NOTE(review): the pattern below is a string literal split across lines with
# nothing but line breaks inside it, which is not valid for a single-quoted
# string. Presumably it originally matched one table row of the course list —
# TODO confirm against the page source.
message_zong = re.findall('

',data,re.S)

for message in message_zong:

# NOTE(review): a lazy group with nothing after it always captures the empty
# string; presumably a closing tag followed `(.*?)` here and in the
# `'(.*?)'` searches further down — TODO confirm.
message_fen = re.findall('align="center"(.*?)',message,re.S)

# Each field takes capture group 1 of a search in one cell of the row.
# The local names look like pinyin; presumably: xuhao = serial number,
# bianhao = number, daima = code, xuefen = credits, kechengxingzhi = course
# type, jiaoxuebanhao = teaching-class number, kaikeyuanxi = offering
# department, zhicheng = teacher title, zhuangtai = status, xiaoqu = campus,
# didian = location — verify against the page's column headers.
xuhao = re.search('>(.*)',message_fen[0]).group(1)

bianhao = re.search('>(.*)',message_fen[1],re.S).group(1)

daima = re.search('(.*?)',message_fen[2],re.S).group(1)

classname = re.search('>(.*)',message_fen[3],re.S).group(1)

xuefen = re.search('>(.*)',message_fen[4],re.S).group(1)

kechengxingzhi = re.search('(.*?)',message_fen[5],re.S).group(1)

jiaoxuebanhao =re.search('>(.*)',message_fen[6],re.S).group(1)

kaikeyuanxi = re.search('(.*?)',message_fen[7],re.S).group(1)

teacher = re.search('(.*?)',message_fen[8],re.S).group(1)

zhicheng = re.search('>(.*)',message_fen[9],re.S).group(1)

# NOTE(review): the next statement is incomplete — the string literal and the
# call are never closed. Presumably it searched message_fen[10] (the only
# index between 9 and 11 that is not used elsewhere) — TODO confirm.
time_lianshi = re.findall('>(.*?)

# Concatenate the stripped fragments into one string.
time =''

# Note: the loop variable reuses the name of the `data` parameter.
for data in time_lianshi:

time = time + data.strip()

youxuan = re.search('>(.*)',message_fen[11],re.S).group(1)

zhuangtai = re.search('>(.*)',message_fen[12],re.S).group(1)

# Note: xiaoqu is extracted here but is not included in the output line below.
xiaoqu = re.search('(.*?)',message_fen[13],re.S).group(1)

didian = re.search('(.*?)',message_fen[14],re.S).group(1)

# Build the tab-separated output record, terminated by a newline.
messageni = xuhao +'\t' + bianhao +'\t' + daima +'\t' + \

classname +'\t' + xuefen +'\t' + kechengxingzhi +'\t' +\

jiaoxuebanhao +'\t' + kaikeyuanxi +'\t' + teacher +'\t' +\

zhicheng +'\t' + time +'\t' + youxuan +'\t' + \

zhuangtai +'\t' + didian +'\n'

# `okdata` is not a parameter; it is the file object opened at module level.
okdata.write(messageni)

#read the data

#f = open('data\\page1.txt','r')

#data = get_stock_html()

#data = unicode(data, "utf8").encode("gb2312")

#f.close()

# Driver: fetch result pages 1..65 of the course list and append every parsed
# row to the output file.
#
# NOTE: get_message() writes through the module-level name ``okdata``, so the
# file has to stay bound to that name while the loop runs.
# NOTE(review): the rows written are tab-separated text although the file name
# ends in ".xlsx" - confirm the intended output format.
url = "http://dean.swjtu.edu.cn//servlet/CourseInfoMapAction?MapID=101&PageUrl=..%2Fother%2FCourseList.jsp&OrderType=choose_course_code&OrderValue=asc&SelectAction=QueryAll&KeyWord1=&TeachType=all&SelectTableType=ThisTerm&jumpPage=2"

# The ``with`` block closes the file even if a download or parse step raises.
with open('data\\okdata1.xlsx', 'a+') as okdata:
    for i in range(1, 66):
        # re.sub's fourth positional parameter is ``count``, not ``flags``;
        # re.S was being passed there by mistake. The pattern contains no
        # ".", so the flag is not needed at all.
        new_link = re.sub(r'jumpPage=\d+', 'jumpPage=%d' % i, url)
        html = get_stock_html(new_link)
        get_message(html)
        # Progress output; the single-argument call form prints the same text
        # as the Python 2 print statement.
        print('finsh')
        print(i)

其中,get_stock_html 是获取网页内容的代码;

get_message 是正则表达式部分,用于从网页中提取有用的数据。

遇到的问题:

注意:re.search 返回的是匹配对象,最后要调用 group 才能取得匹配到的文本。本段代码还不完善,尤其是对 findall 和 search 的使用。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值