Python的requests模块是个神器,这里用requests模块实现模拟登录:
#coding:utf-8
import sys
import requests
from bs4 import BeautifulSoup
import re
from pylsy import pylsytable
# Captcha recognition
import os
# NOTE(review): presumably pytesser locates its tesseract files relative to
# the working directory, hence the chdir - confirm. Every relative path used
# later in this script is therefore resolved inside this directory.
# Raw string: the backslashes are path separators, not escape sequences.
os.chdir(r"C:\Python27\Lib\site-packages")
from pytesser import *
# OCR library used for the captcha recognition
# Endpoints of the teaching-administration site used by this script.
pre_url = 'http://mis.teach.ustc.edu.cn/'
login_url = 'http://mis.teach.ustc.edu.cn/userinit.do'
a_url = 'http://mis.teach.ustc.edu.cn/login.do'
grades_url = 'http://mis.teach.ustc.edu.cn/querycjxx.do'

# Request headers sent with every POST made by this script.
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36'),
    ('Referer', 'http://mis.teach.ustc.edu.cn/userinit.do'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip, deflate'),
    ('Accept-Language', 'zh-CN,zh;q=0.8,en;q=0.6'),
    ('Connection', 'keep-alive'),
])

# Form payloads submitted with the POST requests.
pre_data = dict(userbz='s')
login_data = dict(userbz='s', hidjym='')
# NOTE(review): 'chaxun' is already percent-encoded; requests form-encodes
# `data` values itself, so '%' and '+' are escaped a second time - confirm
# that the server accepts this.
grades_data = dict(
    xuenian='',
    chaxun='+%B2%E9++%D1%AF+',
    px='1',
    zd='0',
)
def judging(name, divide=125):
    """Read the text of a captcha image by binarising it and running OCR.

    The image is converted to greyscale, thresholded into a 1-bit image and
    handed to pytesser's ``image_to_string``.

    Args:
        name: Path of the image file to read.
        divide: Grey-level threshold in 0..255; levels below it become 0 and
            all others become 1. Tune it for the captcha at hand.

    Returns:
        Whatever ``image_to_string`` returns for the binarised image.

    Note:
        The recognition rate still has room for improvement.
    """
    # 256-entry lookup table, one entry per grey level. The original built
    # this list under another name and then passed the undefined name
    # `table` to point(), which raised NameError.
    table = [0 if level < divide else 1 for level in range(256)]
    image = Image.open(name)
    grey = image.convert('L')
    # Binarise according to the threshold table.
    binary = grey.point(table, '1')
    return image_to_string(binary)
def getGrades(filename):
    """Log in to the teaching-administration site and save the grades page.

    Prompts on stdin for the account name and password, downloads the login
    captcha to ``c.png`` in the current working directory, reads it with
    ``judging`` and posts the login form. The grades query is then posted on
    the same session and the response text is written to *filename* as UTF-8.

    The login response is not inspected, so a rejected login (for example a
    misread captcha) still results in a page being saved.

    Args:
        filename: Path of the file that receives the grades page text.

    Raises:
        RuntimeError: If the login page has no ``<img id="random">`` element.
    """
    import io  # local import: io.open takes an explicit encoding on Python 2

    userid = raw_input("name:")
    password = raw_input("password:")

    s = requests.Session()
    login_r = s.post(login_url, headers=headers, data=pre_data)
    soup = BeautifulSoup(login_r.text, "html.parser")
    captcha = soup.find('img', id='random')
    if captcha is None:
        # Fail with a clear message instead of a TypeError on None['src'].
        raise RuntimeError('captcha image not found on the login page')
    img = s.get(pre_url + captcha['src'])
    with open('c.png', 'wb') as f:
        f.write(img.content)
    # NOTE(review): OCR output usually carries trailing whitespace/newlines;
    # strip it so only the captcha characters are submitted - confirm.
    code = judging('c.png').strip()

    # Build the form from a copy so the credentials are not left behind in
    # the module-level login_data dict.
    form = dict(login_data, userCode=userid, passWord=password, check=code)
    s.post(a_url, headers=headers, data=form)
    grades = s.post(grades_url, headers=headers, data=grades_data)

    # Python 2 only: switches implicit unicode/str conversions to UTF-8 for
    # the whole process. It is no longer needed for the write below, but is
    # kept because code that runs after this call may rely on it
    # (NOTE(review): e.g. printing unicode course names) - confirm before
    # removing.
    reload(sys)
    sys.setdefaultencoding('utf8')
    with io.open(filename, 'w', encoding='utf-8') as f:
        f.write(grades.text)
def sousa(filename):
    """Parse a saved grades page into three parallel lists.

    Every ``<tr class="bg">`` row except the first is read; from each row the
    3rd, 5th and 7th ``<td class="bg">`` cells are collected.

    Args:
        filename: Path of the saved page to parse.

    Returns:
        tuple: ``(courseName, courseGrades, courseGPA)``, three lists of equal
        length holding the ``.string`` of the selected cells. All three are
        empty when the page has no matching rows.

    Raises:
        IndexError: If a data row has fewer than seven matching cells.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(filename) as f:
        text = f.read()
    soup = BeautifulSoup(text, "html.parser")
    # The first matching row is dropped (presumably the table header -
    # verify). Slicing, unlike `del trs[0]`, does not raise when the page
    # has no rows at all.
    rows = soup.find_all('tr', class_='bg')[1:]
    courseName = []
    courseGrades = []
    courseGPA = []
    for course in rows:
        tds = course.find_all('td', class_='bg')
        courseName.append(tds[2].string)
        courseGrades.append(tds[4].string)
        courseGPA.append(tds[6].string)
    return (courseName, courseGrades, courseGPA)
def writeGrades(filename, source=None):
    """Write the parsed course list to *filename*, one course per line.

    Each line has the form ``"<name> <grade> <gpa> \\n"`` and the file is
    written as UTF-8.

    Args:
        filename: Path of the text file to write.
        source: Path of the saved grades page handed to ``sousa``. Defaults to
            *filename*: the page is parsed completely first and the file is
            then overwritten with the plain-text list.
            NOTE(review): the original called ``sousa()`` without its
            required argument, which raised TypeError; confirm that reading
            from *filename* is the intended default.
    """
    import io  # local import: io.open takes an explicit encoding on Python 2

    if source is None:
        source = filename
    courseName, courseGrades, courseGPA = sousa(source)
    # Explicit UTF-8 text mode, so writing non-ASCII course names does not
    # depend on the process-wide default encoding.
    with io.open(filename, 'w', encoding='utf-8') as f:
        for row in zip(courseName, courseGrades, courseGPA):
            f.write(u'%s %s %s \n' % row)
if __name__ == '__main__':
    # Download the grades page to disk, then parse the saved copy.
    getGrades('test.txt')
    courseName, courseGrades, courseGPA = sousa('test.txt')

    # One table column per parsed list, added in header order.
    attributes = ['courseName', 'courseGrades', 'coursePoints']
    table = pylsytable(attributes)
    for column, values in zip(attributes, (courseName, courseGrades, courseGPA)):
        table.add_data(column, values)
    print(table)
利用requests.Session()保持会话并构造POST请求,具体的表单字段需要根据目标网站具体分析。
图像处理用到了PIL和pytesser库。pytesser调用的tesseract是谷歌维护的一个开源OCR(光学字符识别)引擎,可用于数字、字母、汉字识别(识别率有待优化)。
相关主要代码:
image = Image.open(name)
image2 = image.convert('L')
image_text = image2.point(table,'1')
return image_to_string(image_text)