使用 PhantomJS & CasperJS + jsoup 获取网页内容

-------------

// http://casperjs.org/

// Create the single CasperJS instance used by this script.
// NOTE: the instance must be created only once; creating a second one would
// throw away the page settings and the user agent configured here.
var casper = require('casper').create({
  pageSettings: {
    loadImages: false // do not load images, to reduce the number of requests
  }
});

var fs = require('fs');

casper.userAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36');

// Positional command-line arguments (an array); the first one, when present,
// is the URL of the page to fetch.
var url = casper.cli.args;
console.log("casper.cli.args:",url);

// Fall back to the sample page when no URL is given on the command line.
var DEFAULT_FETCH_URL = 'http://xian.qq.com/l/dqlv/lyzx/dqlyzx.htm';
var fetchUrl = (url && url.length > 0) ? String(url[0]) : DEFAULT_FETCH_URL;

casper.start(fetchUrl, function() {
    // Print the outer HTML of the first element matching the selector so the
    // calling process can read it from standard output.
    this.echo(this.getHTML('div .main',true));
    // To save the markup to a file instead of printing it:
    //   fs.write("daqin.html", this.getHTML('div.main'), 'w');
});
function getTextContent(strRule, strMesg)
{
//给evaluate传入参数
var textinfo = casper.evaluate(function(rule) {
    var valArr = '';
    $(rule).each(function(index,item){
    valArr = valArr + $(this).text() + ',';
      });
    return valArr.substring(0,valArr.length-1);
}, strRule);
    casper.echo(strMesg);
    require('utils').dump(textinfo.split(','));
    return textinfo.split(',');
};

// Run the navigation steps registered via casper.start().
casper.run();

---------------------

public class Daqin {

//jsoup细化解析html
public static void main(String[] args) {
try {
String content = getAjaxCotnent("http://xian.qq.com/l/dqlv/lyzx/dqlyzx.htm");
Document doc = Jsoup.parse(content); 
Elements result1 = doc.getElementsByTag("li");
Elements result2 = doc.getElementsByTag("a");
Elements result3 = doc.getElementsByTag("span");
for(Element element1 : result1){
System.out.println(element1.text());
}
for(Element element2 : result2){
System.out.println(element2.text());
}
for(Element element3 : result3){
System.out.println(element3.text());
}
} catch (IOException e) {
e.printStackTrace();
}
}


public static String getAjaxCotnent(String url) throws IOException {
Runtime rt = Runtime.getRuntime();

Process p = rt.exec("C:\\casperjs\\bin\\" + "casperjs.exe " + "C:\\casperjs\\tests\\" + "daqin.js " + url);
InputStream is = p.getInputStream();
//BufferedReader br = new BufferedReader(new InputStreamReader(is));
BufferedReader br = new BufferedReader(new InputStreamReader(is, "gbk")); 
StringBuffer sbf = new StringBuffer();
String tmp = "";
while ((tmp = br.readLine()) != null) {
sbf.append(tmp.trim());
}
System.out.println("---------"+sbf.toString());
return sbf.toString();
}


}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值