Python POST and GET 登录 Web 后台系统并抓取页面

#coding=utf8
#! /usr/bin/env python

import httplib
import re
import socket
import urllib

# Process-wide default timeout, in seconds, applied to sockets that are
# created without an explicit timeout of their own.
timeout = 60
socket.setdefaulttimeout(timeout)


def getTable():
    """Return the inner HTML of a ``<tbody>`` element found in kvpage.html.

    The file ``kvpage.html`` (in the current working directory) is scanned
    line by line.  The first line that contains both ``<tbody>`` and a later
    ``</tbody>`` wins; because the pattern starts with a greedy ``.*``, the
    last such ``<tbody>`` on that line is the one captured.  A ``<tbody>``
    that spans several lines is not detected.

    Returns:
        The text between ``<tbody>`` and ``</tbody>``, or ``None`` when no
        line matches.

    Raises:
        IOError: if ``kvpage.html`` cannot be opened or read.
    """
    pattern = re.compile(r'.*<tbody>(.*?)</tbody>.*')

    # ``with`` closes the handle even when reading raises part-way through.
    with open('kvpage.html') as f:
        for line in f:
            m = pattern.match(line.strip())
            if m is not None:
                return m.group(1)

    return None

def extractKvEvents(content):
    """Split table HTML into a list of rows, each a list of cell strings.

    Args:
        content: HTML text holding ``<tr>...</tr>`` rows, or ``None`` / an
            empty string when there is nothing to parse.

    Returns:
        One list per ``<tr>`` element, in document order.  Each inner list
        holds the text of every ``<td class="confluenceTd">`` cell in that
        row.  A row without such cells (a header row made of ``<th>``, for
        example) yields an empty list.  ``None`` or empty input yields ``[]``.
    """
    # Without this guard a None input reaches findall() and raises TypeError.
    if not content:
        return []

    patternTR = re.compile(r"<tr>(.*?)</tr>")
    patternTD = re.compile(r'<td class="confluenceTd">(.*?)</td>')

    # findall() always returns a list (possibly empty), never None.
    return [patternTD.findall(row) for row in patternTR.findall(content)]

def outputToExcel(table):
    """Print every row of *table* to standard output, one row per line.

    Despite the name, nothing is written to an Excel file here.
    """
    for cells in table:
        print(cells)

def loginWiki():
    """POST the login form and save the returned session cookie to cookie.txt.

    The form is sent to ``/login.action`` on ``xxx.com:8080``.  The value of
    the response's ``Set-Cookie`` header is printed and then written to
    ``cookie.txt`` in the current working directory.

    Any exception is printed rather than propagated, so callers cannot tell
    from the return value whether the login succeeded.
    """
    httpClient = None
    try:
        params = urllib.urlencode({'os_username': 'xxxx@xxx.com',
                                   'os_password': 'xxxx',
                                   'login': 'Log In'})

        headers = {"Content-type": "application/x-www-form-urlencoded",
                   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}

        httpClient = httplib.HTTPConnection("xxx.com", 8080, timeout=30)
        httpClient.request("POST", "/login.action", params, headers)

        response = httpClient.getresponse()
        cookie = response.getheader('Set-Cookie')
        print(cookie)

        # getheader() returns None when the header is absent.  Bail out
        # before opening cookie.txt so the existing file is not truncated.
        if cookie is None:
            print('login response has no Set-Cookie header; cookie.txt left unchanged')
            return

        # ``with`` closes the file even if the write fails.
        with open('cookie.txt', 'w') as cookieFile:
            cookieFile.write(cookie)
    except Exception as e:
        print(e)
    finally:
        if httpClient:
            httpClient.close()

def catchPage():
    """GET the target page with the saved cookie and store it as kvpage.html.

    The cookie is read from ``cookie.txt``, sent in the ``Cookie`` header of
    a GET request for ``/xxxPath`` on ``xxx.com:8080``, and the response body
    is written to ``kvpage.html`` in the current working directory.  The
    response status and reason are printed; the body is saved whatever the
    status is.

    Any exception is printed rather than propagated.
    """
    httpClient = None

    try:
        # Read the cookie saved by the login step.
        with open('cookie.txt') as f:
            cookie = f.read().strip()
        print(cookie)

        # Build the request headers.
        headers = {"Content-type": "application/x-www-form-urlencoded",
                   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                   'Cookie': cookie}

        # Send the request.
        httpClient = httplib.HTTPConnection('xxx.com', 8080, timeout=30)
        httpClient.request('GET', '/xxxPath', headers=headers)

        # response is an HTTPResponse object.
        response = httpClient.getresponse()
        print(response.status)
        print(response.reason)

        # Read the body before opening the output file, so a failed or
        # timed-out read does not truncate a previously saved page.
        body = response.read()
        with open('kvpage.html', 'w') as htmlPage:
            htmlPage.write(body)
    except Exception as e:
        print(e)
    finally:
        if httpClient:
            httpClient.close()

if __name__ == '__main__':

    # Pipeline: log in, download the page, pull the table out of the saved
    # HTML, then print the extracted rows.
    loginWiki()
    catchPage()
    tablecontent = getTable()
    # getTable() returns None when no line of kvpage.html holds a complete
    # <tbody>...</tbody>; stop with a clear message and a non-zero exit
    # status instead of passing None to the row extractor.
    if tablecontent is None:
        raise SystemExit('no <tbody>...</tbody> found on a single line of kvpage.html')
    table = extractKvEvents(tablecontent)
    outputToExcel(table)
 
    

  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值