爬虫模拟请求

模拟请求

#!/usr/local/bin/python2.7
# encoding: utf-8
'''
Created on 2015年11月5日

@author: wwhhff11
'''
import urllib2
import urllib
import cookielib
from StringIO import StringIO
import gzip
import chardet

# Browser-like headers sent with the search request.
request_header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    # Advertise only gzip: that is the only Content-Encoding this script
    # decodes.  Offering "deflate" or "sdch" lets the server answer with a
    # body that would be passed on undecoded.
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Cookie': 'first_visit_at=T6D98%2FKwfxjiq7DijCm0bURTXEr3oBj6%0A; Hm_lvt_7263598dfd4db0dc29539a51f116b23a=1446729663; Hm_lpvt_7263598dfd4db0dc29539a51f116b23a=1446729702',
    'Host': 'www.boohee.com',
    # No 'If-None-Match' header: a conditional request can be answered with
    # "304 Not Modified", which urllib2 raises as HTTPError and which carries
    # no page body to scrape.
    'Referer': 'http://www.boohee.com/food/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
}

# Search parameters; they are url-encoded before being sent.
query_string_parameters = {
    'keyword': '鸡蛋',
    'page': 2,
}

# Search endpoint; ends with "?" so an encoded query string can follow it.
request_url = 'http://www.boohee.com/food/search?'

def unGzipHtml(response):
    """Read the response body, gunzip it if needed, and normalise its charset.

    ``response`` must offer ``info()`` (a header mapping with ``get``) and
    ``read()``.  When the ``Content-Encoding`` header is exactly ``'gzip'``
    the body is decompressed; otherwise it is used as received.  The bytes
    are then handed to ``fixCharset`` and its result is returned.
    """
    # Inspect the header before consuming the body, as the stream is read once.
    is_gzipped = response.info().get('Content-Encoding') == 'gzip'
    payload = response.read()
    if is_gzipped:
        payload = gzip.GzipFile(fileobj=StringIO(payload)).read()
    return fixCharset(payload)

def fixCharset(html):
    """Re-encode ``html`` (a byte string) as UTF-8.

    The source encoding is guessed with ``chardet.detect``.  Undecodable
    bytes are dropped (``'ignore'``).  When chardet cannot name an encoding
    (it reports ``None``, e.g. for an empty body) UTF-8 is assumed instead
    of passing ``None`` to ``decode``, which would raise ``TypeError``.
    """
    detected = chardet.detect(html)
    encoding = detected['encoding'] or 'utf-8'
    return html.decode(encoding, 'ignore').encode('utf-8')

def solve(html):
    """Placeholder for processing the fetched HTML; currently does nothing."""
    return None

if __name__ == '__main__':
    cj=cookielib.LWPCookieJar()
    opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

    print request_url
    print urllib.urlencode(query_string_parameters)
    request=urllib2.Request(request_url, 
                            urllib.urlencode(query_string_parameters), 
                            request_header)

    response=opener.open(request)

    html=unGzipHtml(response)

    print html

模拟登录

# -*- coding: utf-8 -*-
'''
Created on 2015年8月13日

@author: wwhhff11
'''

import urllib2
import urllib
import gzip
from StringIO import StringIO
import chardet
from lxml import etree
import cookielib

class TechLogin(object):

    'construction'
    def __init__(self,username,password):
        self.username=username
        self.password=password
        self.loginUrl='https://ids-swust.fayea.com/cas/login'
        self.postHeader={
            'Host': 'ids-swust.fayea.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'https://ids-swust.fayea.com/cas/login?service=https%3A%2F%2Fmatrix%2Edean%2Eswust%2Eedu%2Ecn%2FacadmicManager%2Findex%2Ecfm%3Fevent%3DstudentPortal%3ADEFAULT%5FEVENT',
            'Connection': 'keep-alive'
            }

    'login'
    def login(self):
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        postParam=self.getPostParam()
        request=urllib2.Request(self.loginUrl, postParam, self.postHeader)
        html=self.getHtmlContent(request,opener)
        try:
            self.realUrl=url=self.getNextUrl(html)
            html=self.getHtmlContent(url, opener)
            request=urllib2.Request('https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentProfile:courseMark',None,self.getHeader())
            html=self.getHtmlContent(request, opener)
            print html
            print 'Login sucess!'
        except:
            print 'Login error!'
            return False
        return True

    'get the param'
    def getPostParam(self):
        postParam={
            'lt': 'LT-82C85EEE-CEB9-3EF6-6EE1931298ED7D61',
            'username': self.username,
            'password': self.password,
            'service': 'https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentPortal:DEFAULT_EVENT'     
            }
        return urllib.urlencode(postParam)

    'get the content of html'
    def getHtmlContent(self,request,opener):
        response = opener.open(request)
        'gzip and no-gzip'
        if response.info().get('Content-Encoding') == 'gzip':
            buf = StringIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            html = f.read()
        else:
            html = response.read()
        return self.transCharset(html)

    'trans the charset'
    def transCharset(self,html):
        charset=chardet.detect(html)
        return html.decode(charset['encoding'],'ignore').encode('utf-8')

    'get the real url'
    def getNextUrl(self,html):
        page=etree.HTML(html)
        hrefs=page.xpath(u'//a[@class="btn btn-primary"]')
        try:
            return hrefs[0].get("href")
        except Exception as e:
            return None;

    'grade header'    
    def getHeader(self):
        header={
            'Host': 'matrix.dean.swust.edu.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': self.realUrl,
            'Connection': 'keep-alive'
            }
        return header

if __name__ == '__main__':
    # Placeholder credentials; login() prints its own success/error message.
    client = TechLogin('xxxxxxxx', 'xxxxxx')
    client.login()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值