Scraping dynamic web pages with Python: how do you scrape dynamic page content?


Give it a url and it returns the html; I wrote a function for this long ago.

Search for:

getUrlRespHtml

and you will find the corresponding Python function:

#------------------------------------------------------------------------------

# Python 2; in crifanLib.py these imports and the gConst/gVal globals
# are defined at module level
import logging
import urllib
import urllib2
import zlib

def getUrlResponse(url, postDict={}, headerDict={}, timeout=0, useGzip=False, postDataDelimiter="&"):
    """Get the response from a url; supports optional postDict, headerDict, timeout, useGzip.

    Note:
    1. if postDict is non-empty, the request automatically becomes a POST instead of the default GET
    2. if you want cookies handled automatically, call initAutoHandleCookies() before using this
       function; subsequent urllib2.Request calls will then handle cookies automatically
    """
    # make sure url is a str, not unicode, otherwise urllib2.urlopen will error out
    url = str(url)

    if postDict:
        if postDataDelimiter == "&":
            postData = urllib.urlencode(postDict)
        else:
            postData = ""
            for eachKey in postDict.keys():
                postData += str(eachKey) + "=" + str(postDict[eachKey]) + postDataDelimiter
        postData = postData.strip()
        logging.info("postData=%s", postData)
        req = urllib2.Request(url, postData)
        logging.info("req=%s", req)
        req.add_header('Content-Type', "application/x-www-form-urlencoded")
    else:
        req = urllib2.Request(url)

    defHeaderDict = {
        'User-Agent'    : gConst['UserAgent'],
        'Cache-Control' : 'no-cache',
        'Accept'        : '*/*',
        'Connection'    : 'Keep-Alive',
    }

    # add the default headers first
    for eachDefHd in defHeaderDict.keys():
        #print "add default header: %s=%s"%(eachDefHd, defHeaderDict[eachDefHd])
        req.add_header(eachDefHd, defHeaderDict[eachDefHd])

    if useGzip:
        #print "use gzip for", url
        req.add_header('Accept-Encoding', 'gzip, deflate')

    # add customized headers afterwards -> allows overwriting the default headers
    if headerDict:
        #print "added header:", headerDict
        for key in headerDict.keys():
            req.add_header(key, headerDict[key])

    if timeout > 0:
        # set the timeout value if necessary
        resp = urllib2.urlopen(req, timeout=timeout)
    else:
        resp = urllib2.urlopen(req)

    # update cookies into the local file
    if gVal['cookieUseFile']:
        gVal['cj'].save()
        logging.info("gVal['cj']=%s", gVal['cj'])

    return resp

#------------------------------------------------------------------------------
# get the response html (== body) from a url
def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=True, postDataDelimiter="&"):
    resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip, postDataDelimiter)
    respHtml = resp.read()

    # even if we did not send "Accept-Encoding: gzip, deflate", the response may
    # still come back gzip- or deflate-compressed, so decide from the actual
    # response headers whether to decompress
    respInfo = resp.info()

    # typical response info:
    # Server: nginx/1.0.8
    # Date: Sun, 08 Apr 2012 12:30:35 GMT
    # Content-Type: text/html
    # Transfer-Encoding: chunked
    # Connection: close
    # Vary: Accept-Encoding
    # ...
    # Content-Encoding: gzip

    # sometimes the request asks for gzip,deflate but the returned html is actually
    # not compressed -> the response info then lacks the "Content-Encoding: gzip" above
    # eg: http://blog.sina.com.cn/s/comment_730793bf010144j7_3.html
    # -> so only decompress when the data is indeed gzipped (or deflated)
    if "Content-Encoding" in respInfo:
        if "gzip" == respInfo['Content-Encoding']:
            respHtml = zlib.decompress(respHtml, 16 + zlib.MAX_WBITS)
        elif "deflate" == respInfo['Content-Encoding']:
            respHtml = zlib.decompress(respHtml, -zlib.MAX_WBITS)

    return respHtml

And sample code:

url = "http://www.crifan.com"
respHtml = getUrlRespHtml(url)
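Since a non-empty postDict turns the request into a POST and headerDict lets you add or override headers, the same call also covers simple form submissions. A minimal sketch; the endpoint and field names below are hypothetical:

loginUrl = "http://www.example.com/login"       # hypothetical endpoint
postDict = {
    "username": "yourName",                     # hypothetical form fields
    "password": "yourPassword",
}
headerDict = {
    "Referer": "http://www.example.com/login",  # some sites check the Referer
}
respHtml = getUrlRespHtml(loginUrl, postDict=postDict, headerDict=headerDict, timeout=10)
print respHtml[:200]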

For the complete library, search for:

crifanLib.py
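The docstring above mentions initAutoHandleCookies(); its body lives in crifanLib.py and is not reproduced here, but the standard urllib2 way to get the same effect is to install a global opener backed by a cookielib jar. A sketch of that standard approach, not necessarily crifanLib's exact code; the cookie file name is made up:

import cookielib
import urllib2

# a file-backed cookie jar, matching the gVal['cookieUseFile'] branch above
cj = cookielib.MozillaCookieJar("cookies.txt")  # hypothetical file name
# install a global opener so every later urllib2.Request reuses the jar
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)
# after this, getUrlResponse/getUrlRespHtml calls share the same cookies,
# and cj.save() persists them to cookies.txt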

For scraping dynamic pages specifically, see:

Python专题教程:抓取网站,模拟登陆,抓取动态网页

(search for that title to find it)
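That tutorial is not reproduced here, but the usual idea behind scraping a "dynamic" page is that the visible html is filled in by JavaScript, so you open the browser's developer tools, find the AJAX request the page actually makes, and fetch that URL directly with the same function. A sketch, with a hypothetical endpoint and JSON layout:

import json

ajaxUrl = "http://www.example.com/api/comments?page=1"  # hypothetical endpoint
respJson = getUrlRespHtml(ajaxUrl, headerDict={"X-Requested-With": "XMLHttpRequest"})
data = json.loads(respJson)
for item in data.get("comments", []):  # hypothetical JSON structure
    print item.get("content")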
