1. 引言:
爬虫要爬取动态页面的信息,采用Selenium+PhantomJS是不错的选择。遗憾的是,PhantomJS不能直接下载非html文件。
但基于PhantomJS的CasperJS却有下载功能。于是有人分析其中的奥秘[1].
其关键点在于Ajax:在已打开的页面里用XMLHttpRequest请求目标资源,再把响应内容作为字符串返回给调用方。
相反,直接用selenium的get()打开下载资源是无法获取数据的。
2.实现方法:
依照原理,摸索出了“Selenium+PhantomJS 实现非html文件下载”的方法:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# JavaScript helpers taken from the CasperJS code
# JavaScript source injected into the page ahead of the actual call.
# It defines three functions:
#   sendAJAX(url, method, data, async) - issues an XMLHttpRequest and returns
#       xhr.responseText; the MIME type is overridden to
#       "text/plain; charset=x-user-defined" before sending.
#   getBinary(url, method, data)       - synchronous sendAJAX wrapper that
#       logs the error and returns "" on failure.
#   download(url)                      - GET wrapper around getBinary.
# NOTE: the (misspelled) name FuncionsJS is kept because Download() refers
# to it.
FuncionsJS = r"""
function sendAJAX(url, method, data, async) {
    var xhr = new XMLHttpRequest(),
        dataString = "",
        dataList = [];
    method = method && method.toUpperCase() || "GET";
    xhr.open(method, url, !!async);
    console.log("Jude-Log sendAJAX(): Using HTTP method: '" + method + "'", "debug");
    xhr.overrideMimeType("text/plain; charset=x-user-defined");
    if (method === "POST") {
        if (typeof data === "object") {
            for (var k in data) {
                dataList.push(encodeURIComponent(k) + "=" + encodeURIComponent(data[k].toString()));
            }
            dataString = dataList.join('&');
            console.log("Jude-Log sendAJAX(): Using request data: '" + dataString + "'", "debug");
        } else if (typeof data === "string") {
            dataString = data;
        }
        xhr.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");
    }
    xhr.send(method === "POST" ? dataString : null);
    //console.log("Jude-Log xhr.responseText : " + xhr.responseText);
    return xhr.responseText;
};
function getBinary(url, method, data) {
    try {
        return sendAJAX(url, method, data, false);
    } catch (e) {
        if (e.name === "NETWORK_ERR" && e.code === 101) {
            console.log(" Jude-Log getBinary(): Unfortunately, casperjs cannot make cross domain ajax requests", "warning");
        }
        console.log("Jude-Log getBinary(): Error while fetching " + url + ": " + e, "error");
        return "";
    }
};
function download(url) {
    try {
        return getBinary(url, "GET", "");
    } catch (e) {
        console.log( "Error while downloading %s ", e );
        return "Error -_-";
    }
};
//return download("http://www.jb51.net/images/logo.gif" );
"""
def SaveData(data, targetFile):
    """Write the text returned by the in-page download() call as raw bytes.

    The JavaScript side requests the resource with the MIME type overridden
    to "text/plain; charset=x-user-defined", so every byte of the response
    arrives as one character of ``data``.  Under that charset bytes
    0x00-0x7F are plain ASCII code points and bytes 0x80-0xFF are carried as
    U+F780-U+F7FF, so the low 8 bits of each code point are the original
    byte.

    Args:
        data: text (one character per byte) as returned by execute_script.
        targetFile: path of the file to create or overwrite.
    """
    # Masking the code point recovers the byte directly; this replaces the
    # earlier approach of parsing the UnicodeEncodeError message text for
    # the two hex digits after "\uf7".
    raw = bytearray(ord(ch) & 0xFF for ch in data)
    with open(targetFile, "wb") as fp:
        fp.write(raw)
def Download(driver, url, targetFile):
    """Fetch ``url`` from inside the current page and save it to a file.

    The JavaScript helpers in FuncionsJS are injected together with a call
    to download(); the returned text is then handed to SaveData().

    Args:
        driver: a Selenium WebDriver with a page already loaded.
        url: address of the resource to fetch via XMLHttpRequest.
        targetFile: path of the local file to write.
    """
    # Hand the URL over as a script argument instead of concatenating it
    # into the JavaScript source: a quote or backslash in the URL would
    # otherwise break (or inject into) the generated script.
    CallJS = 'return download(arguments[0]);'
    data = driver.execute_script(FuncionsJS + CallJS, url)
    SaveData(data, targetFile)
if "__main__" == __name__:
    # Demo: start PhantomJS, open a page, download one image through the
    # in-page XMLHttpRequest helper, then dump the browser console log.

    # Needed by the error handler below; the file's import block does not
    # provide it.
    import traceback

    # Optional PhantomJS command-line switches (proxy / web security).
    # Kept for reference; they are not passed to the driver, but can be
    # enabled by adding service_args=service_args_obj to the call below.
    service_args_obj = [
        '--proxy=localhost:8080',
        '--proxy-type=http',
        "--web-security=false",
    ]

    # Work on a copy so the shared DesiredCapabilities.PHANTOMJS dict is
    # not modified for the rest of the process.
    caps = dict(DesiredCapabilities.PHANTOMJS)
    caps['phantomjs.page.settings.webSecurityEnabled'] = False

    driver = None
    try:
        driver = webdriver.PhantomJS("phantomjs", desired_capabilities=caps)
        # A page is loaded before the download call -- presumably so the
        # XMLHttpRequest is issued from a real page context.
        driver.get("http://android.d.cn")
        Download(driver, "http://raw.android.d.cn/cdroid_res/web/news20151016/img/logo.png", "logo.png")
        for entry in driver.get_log('browser'):
            print(" *** LOG %s" % (entry,))
        raw_input("...")  # Python 2 builtin: wait for Enter before quitting
    except Exception:
        print(">>>" + traceback.format_exc())
    finally:
        # Always shut the browser down, even after a failure, so no
        # PhantomJS process is left behind.
        if driver is not None:
            driver.quit()
注意:这种方法会把整个文件内容作为一个字符串经execute_script返回,并一次性载入内存;能否下载大文件尚未测试!
3. 参考:
【1】http://www.cnblogs.com/kavmors/p/4744445.html
【2】http://docs.casperjs.org/en/latest/faq.html?highlight=download#i-m-having-hard-times-downloading-files-using-download