# -*- coding: utf-8 -*- import urllib2 import cookielib import urllib import Image import cStringIO from pytesser import * import re import os #避免 UnicodeEncodeError: 'ascii' codec can't encode character. 的报错 import sys reload(sys) sys.setdefaultencoding( "utf-8" ) #下面这段是关键了,将为urlib2.urlopen绑定cookies #MozillaCookieJar(也可以是 LWPCookieJar ,这里模拟火狐,所以用这个了) 提供可读写操作的cookie文件,存储cookie对象 cookiejar = cookielib.MozillaCookieJar() # 将一个保存cookie对象,和一个HTTP的cookie的处理器绑定 cookieSupport= urllib2.HTTPCookieProcessor(cookiejar) #下面两行为了调试的 httpHandler = urllib2.HTTPHandler(debuglevel=1) httpsHandler = urllib2.HTTPSHandler(debuglevel=1) #创建一个opener,将保存了cookie的http处理器,还有设置一个handler用于处理http的 opener = urllib2.build_opener(cookieSupport, httpsHandler) #将包含了cookie、http处理器、http的handler的资源和urllib2对象绑定在一起,安装opener,此后调用urlopen()时都会使用安装过的opener对象, urllib2.install_opener(opener) #登陆页面 loginpage = "http://zhuzhou2013.feixuelixm.teacher.com.cn/IndexPage/Index.aspx" #要post的url LoginUrl = "http://zhuzhou2013.feixuelixm.teacher.com.cn/GuoPeiAdmin/Login/Login.aspx" ##打开登陆页面, 以此来获取cookies 。 但是因为 ##打开验证码页面就可以获取全部cookies了,所以可以直接跳过这一步。算是可有可无的 #taobao = urllib2.urlopen(loginpage) ##打印cookies #print cookiejar ##先打开页面获取的cookie与 后打开验证码页面的cookie不同。 ##提取验证码text(手动输入验证码) #vrifycodeUrl = "http://zhuzhou2013.feixuelixm.teacher.com.cn/GuoPeiAdmin/Login/ImageLog.aspx" #file = urllib2.urlopen(vrifycodeUrl) #pic= file.read() #path = "c:code.jpg" ##img = cStringIO.StringIO(file) # constructs a StringIO holding the image AttributeError: addinfourl instance has no attribute 'seek' #localpic = open(path,"wb") #localpic.write(pic) #localpic.close() #print "please %s,open code.jpg"%path ##text =raw_input("input code :") #im = Image.open(path) #text =image_to_string(im) #print text #提取验证码地址(用pytesser 识别, 自己网上找教程安装) #并且用pytesser 识别验证码,赋值给 text ,并打印出来。 vrifycodeUrl = "http://zhuzhou2013.feixuelixm.teacher.com.cn/GuoPeiAdmin/Login/ImageLog.aspx" file = urllib2.urlopen(vrifycodeUrl).read() img = cStringIO.StringIO(file) # constructs a StringIO holding the image AttributeError: addinfourl instance has no attribute 'seek' im = Image.open(img) text = image_to_string(im) print "vrifycode:", text #设置cookie的值,因为post request head 需要 返回 cookie (不是cookies ,是将cookies的格式处理后的值) cookies = '' #这里要从 for index, cookie in enumerate(cookiejar): #print '[',index, ']'; #print cookie.name; #print cookie.value; #print "###########################" cookies = cookies+cookie.name+"="+cookie.value+";"; print "###########################" cookie = cookies[:-1] print "cookies:",cookie #用户名,密码 #当然,我这里登顶要处理掉密码和用户名 #username = "7879954564555664" #password = "12313164" #用户名,密码 username = "430223198809308045" password = "56961888" #请求数据包 postData = { '__EVENTTARGET':'', '__EVENTARGUMENT':'', '__VIEWSTATE': '/wEPDwUKLTcyMzEyMTY2Nw8WAh4LTG9naW5lZFBhZ2UFEExvZ2luZWRQYWdlLmFzcHgWAmYPZBYCZg8PZBYGHgV0aXRsZQUg55So5oi35ZCNL+WtpuS5oOeggS/ouqvku73or4Hlj7ceB29uZm9jdXMFEGNoZWNrSW5wdXQodGhpcykeBm9uYmx1cgUNcmVzdG9yZSh0aGlzKWQYAQUeX19Db250cm9sc1JlcXVpcmVQb3N0QmFja0tleV9fFgEFC0ltZ2J0bkxvZ2luckJjpNhrusWhtPuT33UJ1dBUkvw=', 'txtUserName':username, 'txtPassWord':password, 'txtCode':text, 'ImgbtnLogin.x':44 , 'ImgbtnLogin.y':14, 'ClientScreenWidth':1180 } #post请求头部 headers = { 'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'zh-cn,en-us;q=0.8,zh;q=0.5,en;q=0.3', 'Accept-Encoding': 'gzip, deflate', 'Host': 'zhuzhou2013.feixuelixm.teacher.com.cn', 'Cookie':cookies, 'User-Agent' : 'Mozilla/5.0 (Windows NT 5.1; rv:29.0) Gecko/20100101 Firefox/29.0', 'Referer' : 'http://zhuzhou2013.feixuelixm.teacher.com.cn/GuoPeiAdmin/Login/Login.aspx', #'Content-Type': 'application/x-www-form-urlencoded', #'Content-Length' :474, 'Connection' : 'Keep-Alive' } #合成post数据 data = urllib.urlencode(postData) print "data:###############" print data #创建request #构造request请求 request = urllib2.Request( LoginUrl,data,headers ) try: #访问页面 response = urllib2.urlopen(request) #cur_url = response.geturl() #print "cur_url:",cur_url status = response.getcode() print status except urllib2.HTTPError, e: print e.code #将响应的网页打印到文件中,方便自己排查错误 #必须对网页进行解码处理 f= response.read().decode("utf8") outfile =open("rel_ip.txt","w") print >> outfile , "%s" % ( f) #但因响应的信息 info = response.info() print info #测试登陆是否成功,因为在testurl只有登陆后才能访问 testurl = "http://zhuzhou2013.feixuelixm.teacher.com.cn/GuoPeiAdmin/Login/LoginedPage.aspx" try: response = urllib2.urlopen(testurl) except urllib2.HTTPError, e: print e.code #因为后面要从网页查找字符来验证登陆成功与否,所以要保证查找的字符与网页编码相同,否则无非得到正确的结论。建议用英文查找,如css中的 id, name 之类的。 f= response.read().decode("utf8").encode("utf8") outfile =open("out_ip.txt","w") print >> outfile , "%s" % ( f) #在返回的网页中,查找“你好” 两个字符,因为只有登陆成功后才有两个字,找到了即表示登陆成功。建议用英文 tag = '你好'.encode("utf8") if re.search( tag,f): #登陆成功 print 'Logged in successfully!' else: #登陆失败 print 'Logged in failed, check result.html file for details' response.close() |