#-*- coding=utf-8 -*-
"""Download student profile photos from matrix.dean.swust.edu.cn.

For every selected row of sheet ``Sheet1`` in ``1.xlsx`` the script posts
the CAS login form, reads the student profile page, and saves the profile
photo into a directory named after the student's major.

Targets Python 2 (``cStringIO`` and ``cookielib`` are Python 2 modules).

NOTE(review): the script signs in as each listed student with credentials
taken from the spreadsheet. Confirm that the account holders and the
institution have authorized this before running it.
"""
import requests
import re
import json
import time
from PIL import Image
import cStringIO
import cookielib
import urllib
import os
import xlrd

from requests.packages.urllib3.exceptions import InsecureRequestWarning,InsecurePlatformWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)

# Workbook with the student list; opened at import time.
data = xlrd.open_workbook('1.xlsx')
table = data.sheet_by_name(u'Sheet1')

message_url = 'https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentProfile:DEFAULT_EVENT'
login_url = 'https://matrix.dean.swust.edu.cn/cas/login'
topic_url = ''   # link extracted from the page returned by the login POST
flag = 0         # set to 1 by login() when that link was found
temp = ''        # name of the last student whose photo was saved
pic_count = 1

# First spreadsheet row (0-based) that the main loop processes.
START_ROW = 5000


def _blank_student():
    """Return a profile record with every field set to an empty string."""
    return {
        '学号': '',    # student number
        '姓名': '',    # name
        '性别': '',    # gender
        '生日': '',    # birthday (never filled in by this script)
        'pic': '',     # photo URL
        '民族': '',    # ethnicity (never filled in by this script)
        '行政班': '',  # administrative class
        '专业': '',    # major
    }


student = _blank_student()

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}

session = requests.Session()
session.headers = headers
session.cookies = cookielib.LWPCookieJar(filename='cookies')
# Cookies saved by an earlier run are deliberately not loaded; every run
# starts without a stored login.


def get_lt(url="https://matrix.dean.swust.edu.cn/cas/login"):
    """Return the hidden ``lt`` form value from the login page at *url*.

    The value changes between page loads, so it is fetched again for every
    login attempt.

    Raises:
        IndexError: the page contains no ``lt`` hidden field.
    """
    # NOTE(review): this is the only request sent with verify=False; the
    # other requests use the session default. Confirm that is intended.
    index_page = session.get(url, verify=False)
    lt = re.findall(r'name="lt" type="hidden" value="(.*?)"', index_page.content)
    return lt[0]


def login(username, password):
    """Post the login form and remember the link found on the result page.

    On success the module-level ``topic_url`` is set to the first
    ``btn btn-primary`` link of the response and ``flag`` is set to 1.
    When the request fails or the link is missing both are left unchanged.
    The cookie jar is written to disk in either case.

    Exceptions raised by ``get_lt`` propagate to the caller.
    """
    global topic_url
    global flag
    payload = {
        'lt': get_lt(),
        'username': username,
        'password': password,
        'service': 'https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentPortal:DEFAULT_EVENT',
    }
    try:
        login_page = session.post(login_url, data=payload)
        links = re.findall(r'<a class="btn btn-primary" href="(.*?)"', login_page.content)
        topic_url = links[0]
        flag = 1
    except (requests.RequestException, IndexError):
        # Network failure, or no link in the response: treated as a failed
        # login; the caller sees that because flag stays 0.
        pass
    session.cookies.save()


def error_clean(error_temp):
    """Reset all per-student state when *error_temp* equals ``temp``.

    Clears the session cookies, blanks the ``student`` record and resets
    ``flag`` and ``topic_url``. Does nothing when the values differ.
    """
    # flag and topic_url must be declared global, otherwise the assignments
    # below would only create locals and reset nothing.
    global student
    global flag
    global topic_url
    if error_temp == temp:
        session.cookies.clear()
        student = _blank_student()
        flag = 0
        topic_url = ''


def isLogin():
    """Return True when the profile page answers 200 without a redirect."""
    url = "https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentProfile:DEFAULT_EVENT"
    return session.get(url, allow_redirects=False).status_code == 200


def get_message():
    """Fill the module-level ``student`` record from the profile page.

    Requests ``topic_url`` first (its response is not used), then parses
    the ``<td>`` cells of ``message_url``.

    Returns:
        True when every field was extracted. False when the page did not
        have the expected cells; ``student`` is then reset to a blank
        record so that values of a previously processed student can never
        be mixed into this one.
    """
    global student

    session.get(topic_url)
    html = session.get(message_url).text
    cells = re.findall(r'<td>(.*?)</td>', html)

    try:
        record = _blank_student()
        record['学号'] = re.findall(r'<span class="number">(\d*?)</span>', cells[2])[0]
        record['姓名'] = cells[4]
        record['性别'] = cells[6]
        record['专业'] = cells[37]
        record['行政班'] = cells[27]
        record['pic'] = 'https://matrix.dean.swust.edu.cn/acadmicManager/student/profile/' + record['学号'] + '.jpg'
    except IndexError:
        student = _blank_student()
        return False

    student = record
    return True


def download():
    """Save the current student's photo under ``./<major>/<name><number>.jpg``.

    The directory is created when missing. After a successful save the
    student's name is stored in ``temp`` and the session cookies are
    cleared. Any failure while fetching or writing is ignored so that one
    bad record does not stop the run.
    """
    global temp
    savepath = os.path.join(os.path.abspath('.'), student['专业'])
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    try:
        picpath = os.path.join(savepath, student['姓名'] + student['学号'] + '.jpg')
        response = session.get(student['pic'])
        with open(picpath, "wb") as pic:
            pic.write(response.content)
        print(u'>>>>>>>>>成功抓取>>>>>>>>>>>>>>>>>>>>' + student['姓名'])
        temp = student['姓名']
        session.cookies.clear()
    except Exception:
        # Best effort: skip this student and carry on with the next row.
        pass


def main():
    """Process every spreadsheet row from ``START_ROW`` to the last row."""
    global flag
    # Stop at the last row; counting nrows iterations from START_ROW would
    # run past the end of the sheet.
    for i in range(START_ROW, table.nrows):
        # One row read per iteration instead of rebuilding whole columns.
        row = table.row_values(i)
        if row[3] == u'女' and row[2] != u'王珀会':
            try:
                # Column 1 is used as the user name, characters 11-16 of
                # column 13 as the password.
                login(str(int(row[1])), str(row[13])[11:17])
            except Exception:
                # Unusable cell values or a failed token fetch: skip the row.
                pass
            if flag == 1:
                flag = 0
                if get_message():
                    download()
        # Start the next row with an empty cookie jar so that no session
        # state is carried from one student to the next.
        session.cookies.clear()


if __name__ == '__main__':
    main()
http://stackoverflow.com/questions/23816139/clear-cookies-from-requests-python
总结:
python处理excel>> http://www.cnblogs.com/lhj588/archive/2012/01/06/2314181.html
session释放>>
注明:
1.xlsx为提供学生资料的excel
异常处理之间的妥协关系需要事先计划好