2.4.5. 获得Url返回的HTML网页(源码)内容:getUrlRespHtml
#------------------------------------------------------------------------------
# get response html==body from url
#def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=False) :
def getUrlRespHtml(url, postDict=None, headerDict=None, timeout=0, useGzip=True):
    """Return the response body (HTML source) of `url`, decompressed if needed.

    The request itself is made by getUrlResponse(); this function reads the
    body and, when `useGzip` is true, undoes the compression the server
    reports in its "Content-Encoding" response header.

    Parameters:
        url        -- address to fetch.
        postDict   -- POST fields, forwarded unchanged to getUrlResponse();
                      None (the default) is forwarded as an empty dict.
        headerDict -- extra request headers, forwarded unchanged to
                      getUrlResponse(); None is forwarded as an empty dict.
        timeout    -- forwarded unchanged to getUrlResponse().
        useGzip    -- forwarded to getUrlResponse() (presumably makes the
                      request advertise gzip/deflate support -- confirm
                      against getUrlResponse); also enables decompression here.

    Returns:
        The response body exactly as read, or its decompressed form when the
        server declared a gzip/deflate Content-Encoding.

    Raises:
        zlib.error if the server declares a compression the body does not
        actually use. Anything raised by getUrlResponse() propagates.
    """
    # Use a fresh dict per call instead of a shared mutable default argument.
    if postDict is None:
        postDict = {}
    if headerDict is None:
        headerDict = {}

    resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip)
    respHtml = resp.read()
    if not useGzip:
        return respHtml

    # Typical response headers of a compressed reply:
    #   Server: nginx/1.0.8
    #   Content-Type: text/html
    #   Transfer-Encoding: chunked
    #   Vary: Accept-Encoding
    #   Content-Encoding: gzip
    # Some servers ignore the request's "gzip,deflate" offer and answer with
    # plain HTML; then there is no Content-Encoding header at all, e.g.
    #   http://blog.sina.com.cn/s/comment_730793bf010144j7_3.html
    # so only decompress when the header really announces compressed data.
    respInfo = resp.info()
    contentEncoding = ""
    if "Content-Encoding" in respInfo:
        # Content-coding names are case-insensitive and may carry padding.
        contentEncoding = respInfo["Content-Encoding"].strip().lower()

    if contentEncoding in ("gzip", "x-gzip"):
        # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
        respHtml = zlib.decompress(respHtml, 16 + zlib.MAX_WBITS)
    elif contentEncoding == "deflate":
        # "deflate" should be a zlib-wrapped stream, but some servers send a
        # raw deflate stream instead; try the standard form first.
        try:
            respHtml = zlib.decompress(respHtml)
        except zlib.error:
            respHtml = zlib.decompress(respHtml, -zlib.MAX_WBITS)
    return respHtml
例 2.24. getUrlRespHtml的使用范例:不带额外参数
# Simplest call: only the URL is passed, every other parameter keeps its default.
respHtml = getUrlRespHtml(url)
例 2.25. getUrlRespHtml的使用范例:带额外参数
modifyUrl = gVal['blogEntryUrl'] + "/blog/submit/modifyblog";
#logging.debug("Modify Url is %s", modifyUrl);
#http://hi.baidu.com/wwwhaseecom/blog/item/79188d1b4fa36f068718bf79.html
foundSpBlogID = re.search(r"blog/item/(?P\w+?).html", url);
if(foundSpBlogID) :
spBlogID = foundSpBlogID.group("spBlogID");
logging.debug("Extracted spBlogID=%s", spBlogID);
else :
modifyOk = False;
errInfo = "Can't extract post spBlogID !";
return (modifyOk, errInfo);
newPostContentGb18030 = newPostContentUni.encode("GB18030");
categoryGb18030 = infoDict['category'].encode("GB18030");
titleGb18030 = infoDict['title'].encode("GB18030");
postDict = {
"bdstoken" : gVal['spToken'],
"ct" : "1",
"mms_flag" : "0",
"cm" : "2",
"spBlogID" : spBlogID,
"spBlogCatName_o": categoryGb18030, # old catagory
"edithid" : "",
"previewImg" : "",
"spBlogTitle" : titleGb18030,
"spBlogText" : newPostContentGb18030,
"spBlogCatName" : categoryGb18030, # new catagory
"spBlogPower" : "0",
"spIsCmtAllow" : "1",
"spShareNotAllow":"0",
"spVcode" : "",
"spVerifyKey" : "",
}
headerDict = {
# 如果不添加Referer,则返回的html则会出现错误:"数据添加的一般错误"
"Referer" : gVal['blogEntryUrl'] + "/blog/modify/" + spBlogID,
}
respHtml = getUrlRespHtml(modifyUrl, postDict, headerDict);