#-*-coding:utf8-*-
import re
import urllib2
#open internet
def get_stock_html(url):
    """Download `url` and return the response body as one string.

    The request goes through an opener built with urllib2's redirect and
    plain-HTTP handlers and carries a browser-like (MSIE 7) User-agent
    header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The complete response body.

    Raises:
        Whatever `opener.open` raises (urllib2.URLError and friends) is
        propagated unchanged.
    """
    opener = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
    )
    opener.addheaders = [
        ('User-agent',
         "Mozilla/4.0 (compatible;MSIE 7.0;"
         "Windows NT 5.1; .NET CLR 2.0.50727;"
         ".NET CLR 3.0.4506.2152; .NET CLR 3.5.30729")
    ]
    response = opener.open(url)
    try:
        # read() returns the same text as joining readlines().
        return response.read()
    finally:
        # Release the connection even if reading fails; the caller fetches
        # many pages in a row.
        response.close()
#----------------------------------------------------------------------
_CELLS_PER_ROW = 15  # the highest cell index read from a row is 14


def _first_group(pattern, text, flags=0):
    """Return group(1) of the first match of `pattern` in `text`.

    Returns '' when nothing matches, so a cell without the expected markup
    yields an empty column instead of AttributeError on `None.group`.
    """
    found = re.search(pattern, text, flags)
    if found is None:
        return ''
    return found.group(1)


def _format_row(cells):
    """Build the tab-separated output line from one row's cell fragments."""
    plain = r'>(.*)'              # everything after the tag's closing '>'
    underlined = r'<u>(.*?)</u>'  # text wrapped in <u>...</u>

    # The time cell holds several "<br"-terminated entries; they are stripped
    # and glued together without a separator.
    schedule = ''.join(
        part.strip() for part in re.findall(r'>(.*?)<br', cells[10], re.S)
    )

    # Trailing names are the original pinyin identifiers for each column.
    columns = [
        _first_group(plain, cells[0]),             # xuhao; matched without re.S
        _first_group(plain, cells[1], re.S),       # bianhao
        _first_group(underlined, cells[2], re.S),  # daima
        _first_group(plain, cells[3], re.S),       # classname
        _first_group(plain, cells[4], re.S),       # xuefen
        _first_group(underlined, cells[5], re.S),  # kechengxingzhi
        _first_group(plain, cells[6], re.S),       # jiaoxuebanhao
        _first_group(underlined, cells[7], re.S),  # kaikeyuanxi
        _first_group(underlined, cells[8], re.S),  # teacher
        _first_group(plain, cells[9], re.S),       # zhicheng
        schedule,                                  # time
        _first_group(plain, cells[11], re.S),      # youxuan
        _first_group(plain, cells[12], re.S),      # zhuangtai
        # cells[13] (xiaoqu) is not part of the output line.
        _first_group(underlined, cells[14], re.S), # didian
    ]
    return '\t'.join(columns) + '\n'


def get_message(data, out=None):
    """Extract the table rows from an HTML page and write them as text lines.

    Every `<tr bgcolor="#ffffff">...</tr>` row in `data` is split into its
    `align="center"` cells, and one tab-separated, newline-terminated line of
    14 columns is written per row.

    Args:
        data: HTML text of one result page.
        out: Object with a `write` method that receives the lines. Defaults
            to the module-level `okdata` file.

    Returns:
        None.

    Rows with fewer than 15 cells are skipped, and a cell that lacks the
    expected markup contributes an empty column; neither case aborts the run.
    """
    if out is None:
        out = okdata  # module-level output file opened by the script
    for row in re.findall('<tr bgcolor="#ffffff">(.*?)</tr>', data, re.S):
        cells = re.findall('align="center"(.*?)</td>', row, re.S)
        if len(cells) < _CELLS_PER_ROW:
            # Not a complete row; indexing it would raise IndexError.
            continue
        out.write(_format_row(cells))
#read the data
#f = open('data\\page1.txt','r')
#data = get_stock_html()
#data = unicode(data, "utf8").encode("gb2312")
#f.close()
# Output file shared with get_message(), which writes one line per table row.
# NOTE(review): the lines written are tab-separated text although the file
# name ends in .xlsx -- confirm the extension is intended.
okdata = open('data\\okdata1.xlsx', 'a+')
url = "http://dean.swjtu.edu.cn//servlet/CourseInfoMapAction?MapID=101&PageUrl=..%2Fother%2FCourseList.jsp&OrderType=choose_course_code&OrderValue=asc&SelectAction=QueryAll&KeyWord1=&TeachType=all&SelectTableType=ThisTerm&jumpPage=2"
try:
    # Walk result pages 1..65 by rewriting the jumpPage query parameter.
    for i in range(1, 66):
        # The fourth positional argument of re.sub is `count`, not `flags`,
        # so the former re.S there silently meant count=16. The pattern has
        # no '.', so no flag is needed.
        new_link = re.sub(r'jumpPage=\d+', 'jumpPage=%d' % i, url)
        html = get_stock_html(new_link)
        get_message(html)
        # Progress output; the single-argument parenthesised form prints the
        # same text as the Python 2 print statement.
        print('finsh')
        print(i)
finally:
    # Close the file even when a page fails to download or parse.
    okdata.close()
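# Notes:
# get_stock_html is the code that fetches the web page.
# get_message is the regular-expression part; it extracts the useful data.
# Problems encountered:
# Note that re.search must be followed by .group(...) to obtain the matched
# text. This code is still imperfect, especially in its use of findall and
# search.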