# Simulated request (模拟请求)
'''
Created on 2015年11月5日
@author: wwhhff11
'''
import urllib2
import urllib
import cookielib
from StringIO import StringIO
import gzip
import chardet
# Browser-like headers sent with the search request; the User-Agent string
# identifies as Chrome 46 on Windows.
# NOTE(review): the Cookie and If-None-Match values are fixed strings,
# presumably copied from one captured browser session -- confirm the server
# still treats them as meaningful.
request_header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Cookie': 'first_visit_at=T6D98%2FKwfxjiq7DijCm0bURTXEr3oBj6%0A; Hm_lvt_7263598dfd4db0dc29539a51f116b23a=1446729663; Hm_lpvt_7263598dfd4db0dc29539a51f116b23a=1446729702',
    'Host': 'www.boohee.com',
    'If-None-Match': '"ce1c755859e73fc92f042cb8305b56c9"',
    'Referer': 'http://www.boohee.com/food/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
}

# Search keyword and result-page number.
query_string_parameters = dict(keyword='鸡蛋', page=2)

# Search endpoint on www.boohee.com (kept with its trailing '?').
request_url = 'http://www.boohee.com/food/search?'
def unGzipHtml(response):
    """Read the response body, gunzipping it when it is marked as gzip.

    The Content-Encoding response header decides the path: exactly 'gzip'
    means the body is decompressed first, anything else is read as-is.
    The resulting bytes are passed through fixCharset and that result is
    returned.
    """
    content_encoding = response.info().get('Content-Encoding')
    if content_encoding != 'gzip':
        return fixCharset(response.read())
    # GzipFile needs a file-like object, so wrap the raw body in a buffer.
    compressed = StringIO(response.read())
    return fixCharset(gzip.GzipFile(fileobj=compressed).read())
def fixCharset(html):
    """Re-encode *html* as UTF-8 using the charset guessed by chardet.

    Args:
        html: raw byte string of the page.

    Returns:
        The page as a UTF-8 encoded byte string. Bytes that cannot be
        decoded with the guessed charset are dropped ('ignore').
    """
    detected = chardet.detect(html)
    # chardet reports an encoding of None when it cannot make a guess (for
    # example on an empty body); decode(None, ...) would raise TypeError, so
    # fall back to UTF-8 in that case.
    encoding = detected['encoding'] or 'utf-8'
    return html.decode(encoding, 'ignore').encode('utf-8')
def solve(html):
    """Stub: accepts *html* and does nothing with it; returns None."""
    return None
if __name__ == '__main__':
    # Cookie-aware opener: cookies set by the server are kept in the jar.
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

    query_string = urllib.urlencode(query_string_parameters)
    print(request_url)
    print(query_string)

    # Append the encoded parameters to the URL so the request is a GET with
    # a query string. Passing them as the Request's data argument would
    # instead send a POST with a form body to the bare '...search?' URL.
    request = urllib2.Request(request_url + query_string, None, request_header)
    response = opener.open(request)
    html = unGzipHtml(response)
    print(html)
# Simulated login (模拟登录)
'''
Created on 2015年8月13日
@author: wwhhff11
'''
import urllib2
import urllib
import gzip
from StringIO import StringIO
import chardet
from lxml import etree
import cookielib
class TechLogin(object):
    """Log in through the CAS endpoint and fetch the course-mark page.

    The login form is posted to ids-swust.fayea.com; on success the link
    found in the response is followed and the ``studentProfile:courseMark``
    page on matrix.dean.swust.edu.cn is requested and printed.
    """

    def __init__(self, username, password):
        """Store the credentials plus the fixed login URL and POST headers.

        Args:
            username: account name sent in the login form.
            password: password sent in the login form.
        """
        self.username = username
        self.password = password
        self.loginUrl = 'https://ids-swust.fayea.com/cas/login'
        self.postHeader = {
            'Host': 'ids-swust.fayea.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'https://ids-swust.fayea.com/cas/login?service=https%3A%2F%2Fmatrix%2Edean%2Eswust%2Eedu%2Ecn%2FacadmicManager%2Findex%2Ecfm%3Fevent%3DstudentPortal%3ADEFAULT%5FEVENT',
            'Connection': 'keep-alive'
        }

    def login(self):
        """Perform the login sequence and print the course-mark page.

        Returns:
            True when every step after the initial form post succeeded,
            False otherwise (after printing 'Login error!').

        Raises:
            Whatever the initial form post raises; that first request is
            made outside the error handler.
        """
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        postParam = self.getPostParam()
        request = urllib2.Request(self.loginUrl, postParam, self.postHeader)
        html = self.getHtmlContent(request, opener)
        try:
            # realUrl is also read by getHeader() as the Referer value.
            self.realUrl = url = self.getNextUrl(html)
            if url is None:
                # No follow-up link in the response: treat as a failed login
                # instead of letting opener.open(None) blow up.
                print('Login error!')
                return False
            html = self.getHtmlContent(url, opener)
            request = urllib2.Request('https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentProfile:courseMark', None, self.getHeader())
            html = self.getHtmlContent(request, opener)
            print(html)
            print('Login sucess!')
        except Exception:
            # Deliberately broad so any failure maps to False, but not a bare
            # except: KeyboardInterrupt and SystemExit must still propagate.
            print('Login error!')
            return False
        return True

    def getPostParam(self):
        """Return the url-encoded login form body.

        NOTE(review): 'lt' is a fixed string here; CAS login tickets are
        normally issued per login form -- confirm this value is accepted.
        """
        postParam = {
            'lt': 'LT-82C85EEE-CEB9-3EF6-6EE1931298ED7D61',
            'username': self.username,
            'password': self.password,
            'service': 'https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentPortal:DEFAULT_EVENT'
        }
        return urllib.urlencode(postParam)

    def getHtmlContent(self, request, opener):
        """Open *request* with *opener* and return the body as UTF-8 bytes.

        Args:
            request: a urllib2.Request or a URL string.
            opener: the opener used to perform the request.

        Returns:
            The body, gunzipped when Content-Encoding is exactly 'gzip',
            then re-encoded by transCharset.
        """
        response = opener.open(request)
        # Handle both gzip-compressed and plain bodies.
        if response.info().get('Content-Encoding') == 'gzip':
            buf = StringIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            html = f.read()
        else:
            html = response.read()
        return self.transCharset(html)

    def transCharset(self, html):
        """Re-encode *html* as UTF-8 using the charset guessed by chardet.

        Undecodable bytes are dropped ('ignore').
        """
        charset = chardet.detect(html)
        # chardet reports an encoding of None when it cannot guess (e.g. an
        # empty body); decode(None, ...) would raise, so fall back to UTF-8.
        encoding = charset['encoding'] or 'utf-8'
        return html.decode(encoding, 'ignore').encode('utf-8')

    def getNextUrl(self, html):
        """Return the href of the first ``<a class="btn btn-primary">``.

        Returns:
            The href attribute value, or None when no such anchor exists.
        """
        page = etree.HTML(html)
        hrefs = page.xpath(u'//a[@class="btn btn-primary"]')
        if not hrefs:
            return None
        return hrefs[0].get("href")

    def getHeader(self):
        """Return the headers for the course-mark request.

        Reads self.realUrl (assigned in login()) for the Referer value.
        """
        header = {
            'Host': 'matrix.dean.swust.edu.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': self.realUrl,
            'Connection': 'keep-alive'
        }
        return header
if __name__ == '__main__':
    # Placeholder credentials; substitute a real account before running.
    demo = TechLogin(username='xxxxxxxx', password='xxxxxx')
    demo.login()