webmagic保存html页面,webmagic爬虫对静态页面,动态页面及js请求方式爬取的处理...

webmagic爬取网页数据,【分页爬取内容】见上一篇博文https://segmentfault.com/a/1190000020005655

webmagic的官方文档见: http://webmagic.io/docs/zh/ ,可在其中查阅如何使用不同的选择器获取节点信息等内容

网页内容爬取根据页面生成方式基本上可通过以下方法爬取:

一.静态页面【最常见的】,能通过webmagic的常规方法直接爬取数据

二.一些动态生成网页,需要在爬虫程序里使用浏览器驱动将数据渲染到页面上之后再爬取

三.从js请求中能获取数据的网页,可直接构造http请求获取数据

下文将罗列针对这三种爬取方式的webmagic使用,文章较长,可根据你的需要【爬取方式】取用

一. 静态页面爬取

示例:

import com.boe.mps.jrj.dataas.entity.BigDeposit;

import org.joda.time.DateTime;

import org.springframework.stereotype.Repository;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.processor.PageProcessor;

import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;

import java.util.List;

/**

* 大额存单爬虫

*/

@Repository

public class BigDepositProcessor implements PageProcessor{

private static Site site = Site.me().setRetryTimes(3).setSleepTime(100);

@Override

public Site getSite() {

return site;

}

@Override

public void process(Page page) {

// 数据更新时间

String updateTime = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);

// 获取数据节点

List nodes = page.getHtml().$(".ebdp-pc4promote-circularcontainer-wrapper").nodes();

List list = new ArrayList<>();

for (int i = 0; i < nodes.size(); i++) {

BigDeposit bigDeposit = new BigDeposit();

// 根据xpth获取table列表中的td节点

List table = page.getHtml().xpath("//*[@id=circularcontainer"+i+"]/div[2]/table/tbody/tr/td").nodes();

bigDeposit.setItemName(page.getHtml().xpath("//*[@id=circularcontainer"+i+"]/div[1]/span[1]/span/text()").get());

bigDeposit.setItemRate(page.getHtml().xpath("//*[@id=circularcontainer"+i+"]/div[2]/table/tbody/tr/td[2]/a/text()").get());

bigDeposit.setUpdateTime(updateTime);

if (table.size()<=7){

bigDeposit.setStartDepositPrice(page.getHtml().xpath("//*[@id=circularcontainer"+i+"]/div[2]/table/tbody/tr/td[4]/text()").get());

bigDeposit.setGrading(page.getHtml().xpath("//*[@id=circularcontainer"+i+"]/div[2]/table/tbody/tr/td[6]/text()").get());

}else {

bigDeposit.setStartDepositPrice(page.getHtml().xpath("//*[@id=circularcontainer"+i+"]/div[2]/table/tbody/tr/td[5]/text()").get());

bigDeposit.setGrading(page.getHtml().xpath("//*[@id=circularcontainer"+i+"]/div[2]/table/tbody/tr/td[7]/text()").get());

}

list.add(bigDeposit);

}

page.putField("bigDeposit",list);

//打印爬取的内容

list.forEach(e->{

System.out.println(e);

});

}

// 爬虫测试方法

public static void main(String[] args) {

// 爬虫爬取路径

String bigdeposit="https://mybank.icbc.com.cn/servlet/ICBCBaseReqServletNoSession?dse_operationName=per_accountQueryFixedProductsOutOp&cmd=0&NormalOrBooking=0&IN_CURRFLAG=&IN_APPID=02&IN_SAVETYPE=&IN_BIGFLAG=1&JJGFLAG=0&Area_code=1001";

//爬虫构造,将爬取结果打印到控制台上

Spider.create(new BigDepositProcessor()).addPipeline(new ConsolePipeline()).addUrl(bigdeposit).thread(5).run();

}

}

/**

* Entity for one large-denomination certificate-of-deposit product, as filled in by
* BigDepositProcessor. All scraped values are kept as the raw page text.

*/

@Data

public class BigDeposit {

/** Identifier. */

private Long id;

/** Product name. */

private String itemName;

/** Product interest rate (%). */

private String itemRate;

/** Minimum deposit amount (yuan). */

private String startDepositPrice;

/** Transaction increment (step between allowed amounts). */

private String grading;

/** Update time. */

private String updateTime;

}

二. 动态页面chromedriver先渲染再爬取

webmagic调用chromedriver驱动,先渲染页面,再爬取数据

示例:

import com.boe.mps.jrj.dataas.entity.Bond;

import org.joda.time.DateTime;

import org.springframework.stereotype.Repository;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.processor.PageProcessor;

import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;

import java.util.List;

/**

*全部债券产品

*/

@Repository

public class BondProcessor implements PageProcessor{

String updateTime = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);

long lastTime = DateTime.now().getMillis();

private static Site site = Site.me().setRetryTimes(3).setSleepTime(100);

@Override

public Site getSite() {

return site;

}

@Override

public void process(Page page) {

long execTime = DateTime.now().getMillis();

//时间超过一分钟才重新赋值更新时间,否则不赋值!解决分页数据,多次执行时间不一致的问题

if((execTime - lastTime) > 1000*60){

updateTime = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);

lastTime = execTime;

}

// 获取分页数

String s = page.getHtml().xpath("//*[@id=lbInfo]").get();

String sum = s.substring(s.indexOf("总记录数:") + 5, s.indexOf("条"));

System.err.println("ss=="+sum);

int total = Integer.parseInt(sum);

for (int i = 0; i < total; i+=8) {

String nextUrl="https://mybank.icbc.com.cn/icbc/newperbank/nationaldebt/nationaldebt_infoquery_product_nosession.jsp?pageFlag=0&qryBeginPos=1&jSonStrFilter=&isFilterFlag=0&QryTypex=0&matureYear=aaa&term2=aaa&keywords=&remainTerm=aaa&debtType=aaa&couponRate=aaa&currTypeFilter=aaa&pos=0&pos1=0&OrderString=0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C&ExtendTableDisplayFlag=2&beginPos="+(i+1);

// 分页爬取,将下一页的url放入爬虫任务列表里

page.addTargetRequest(nextUrl);

}

List nodes1 = page.getHtml().xpath("//*[@id=ebdp-pc4promote-nationaldebtList]/div").nodes();

System.out.println("tiaoshu="+nodes1.size());

List list = new ArrayList<>();

for (int i = 1; i <=nodes1.size() ; i++) {//*[@id="ebdp-pc4promote-nationaldebtList"]/div[2]/div[1]/div[1]/div[1]/a

Bond bond = new Bond();

String s0 = page.getHtml().xpath("//*[@id=ebdp-pc4promote-nationaldebtList]/div["+i+"]/div[3]").$(".ebdp-pc4promote-tuijian").get()==null?"":"推荐";

String itemname = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[1]/div[1]/a/text()").get();

String s1 = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[1]/div[1]/span[1]/text()").get();

String s2 = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[1]/div[1]/span[2]/text()").get();

String s3 = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[1]/div[1]/span[3]/text()").get();

String tradingtime = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[1]/div[2]/span[2]/text()").get();

String clientbuyingrate = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[2]/div[1]/ul/li[2]/div[2]/span/text()").get();

String clientbuyingprice=page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[2]/div[1]/ul/li[2]/div[3]/b/text()").get();

String clientsellrate=page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[2]/div[1]/ul/li[3]/div[2]/span/text()").get();

String clientsellprice = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[2]/div[1]/ul/li[3]/div[3]/b/text()").get();

String s4 = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[2]/div[1]/ul/li[2]/div[4]/p/text()").get();

String s5 = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[2]/div[1]/ul/li[2]/div[4]/p/b/text()").get();

String couponrate = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[1]/text()").get();

String accruedInterest = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[2]/text()").get();

String currentPaymentDate = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[3]/text()").get();

String currentInterestIncome = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[4]/text()").get();

String interestFrequency = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[5]/text()").get();

String couponBondValue = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[6]/text()").get();

String expireDate = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[7]/text()").get();

String holdExpireInterestIncome = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[8]/text()").get();

String clientBuyingNetPrice = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[9]/text()").get();

String clientSellingNetPrice = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[10]/text()").get();

String itemType = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[11]/text()").get();

bond.setItemName(itemname);

bond.setItemFeature(s0+" "+s1+" "+s2+" "+s3);

bond.setTradingHours(tradingtime);

bond.setClientBuyingRate(clientbuyingrate);

bond.setClientBuyingFullPrice(clientbuyingprice);

bond.setClientSellingRate(clientsellrate);

bond.setClientSellingFullPrice(clientsellprice);

bond.setRemainTimeLimit(s4+s5);

bond.setCouponRate(couponrate.contains(":")?couponrate.substring(couponrate.indexOf(":")+1):"");

bond.setAccruedInterest(accruedInterest.contains(":")?couponrate.substring(couponrate.indexOf(":")+1):"");

bond.setCurrentPaymentDate(currentPaymentDate.contains(":")?couponrate.substring(couponrate.indexOf(":")+1):"");

bond.setCurrentInterestIncome(currentInterestIncome.contains(":")?couponrate.substring(couponrate.indexOf(":")+1):"");

bond.setInterestFrequency(interestFrequency.contains(":")?couponrate.substring(couponrate.indexOf(":")+1):"");

bond.setCouponBondValue(couponBondValue.contains(":")?couponrate.substring(couponrate.indexOf(":")+1):"");

bond.setExpireDate(expireDate.contains(":")?couponrate.substring(couponrate.indexOf(":")+1):"");

bond.setHoldExpireInterestIncome(holdExpireInterestIncome.contains(":")?couponrate.substring(couponrate.indexOf(":")+1):"");

bond.setClientBuyingNetPrice(clientBuyingNetPrice.contains(":")?couponrate.substring(couponrate.indexOf(":")+1):"");

bond.setClientSellingNetPrice(clientSellingNetPrice.contains(":")?couponrate.substring(couponrate.indexOf(":")+1):"");

bond.setItemType(itemType.contains(":")?couponrate.substring(couponrate.indexOf(":")+1):"");

bond.setUpdateTime(updateTime);

list.add(bond);

}

page.putField("bond",list);

list.forEach(e->{

System.out.println(e);

});

}

// 测试示例

public static void main(String[] args) {

String url="https://mybank.icbc.com.cn/icbc/newperbank/nationaldebt/nationaldebt_infoquery_product_nosession.jsp?pageFlag=0&qryBeginPos=1&jSonStrFilter=&isFilterFlag=0&QryTypex=0&matureYear=aaa&term2=aaa&keywords=&remainTerm=aaa&debtType=aaa&couponRate=aaa&currTypeFilter=aaa&pos=0&pos1=0&OrderString=0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C&ExtendTableDisplayFlag=2&beginPos=1";

// 获取系统中chromedriver_linux64的配置

System.setProperty("selenuim_config", "/home/myfile/tool_station/chromedriver_linux64/config.ini");

// 设置SeleniumDownloader驱动的爬取方式

Spider.create(new BondProcessor()).thread(1)

.addPipeline(new ConsolePipeline())

.addUrl(url)

.setDownloader(new SeleniumDownloader("/home/myfile/tool_station/chromedriver_linux64/chromedriver").setSleepTime(1000))

.run();

}

}

chromedriver驱动的下载及配置

chromedriver下载路径[http://chromedriver.storage.googleapis.com/index.html](http://chromedriver.storage.googleapis.com/index.html)

【请下载与你的浏览器版本相同的chromedriver包】

config.ini文件配置如下:

driver=chrome

#chrome_exec_path=/usr/bin/google-chrome-stable

chrome_driver_loglevel=DEBUG

本例实体类如下:

/** Entity for one bond product, as filled in by BondProcessor. Scraped values are kept as text. */
@Data

public class Bond {

/** Identifier. */

private Long id;

/** Bond name. */

private String itemName;

/** Yield to maturity when the client buys. */

private String clientBuyingRate;

/** Full (dirty) price when the client buys. */

private String clientBuyingFullPrice;

/** Yield when the client sells. */

private String clientSellingRate;

/** Full (dirty) price when the client sells. */

private String clientSellingFullPrice;

/** Coupon rate. */

private String couponRate;

/** Accrued interest. */

private String accruedInterest;

/** Interest payment date of the current period. */

private String currentPaymentDate;

/** Interest income of the current period. */

private String currentInterestIncome;

/** Interest payment frequency. */

private String interestFrequency;

/** Face value of the bond. */

private String couponBondValue;

/** Maturity date. */

private String expireDate;

/** Interest income if held to maturity. */

private String holdExpireInterestIncome;

/** Net (clean) price when the client buys. */

private String clientBuyingNetPrice;

/** Net (clean) price when the client sells. */

private String clientSellingNetPrice;

/** Bond type. */

private String itemType;

/** Bond features (badge and tag texts joined by spaces). */

private String itemFeature;

/** Remaining term. */

private String remainTimeLimit;

/** Trading hours. */

private String tradingHours;

/** Update time. */

private String updateTime;

}

三. 能从js请求中直接获取数据的动态渲染网页

示例:

import com.boe.mps.jrj.dataas.entity.ExchangeMarket;

import org.joda.time.DateTime;

import org.springframework.stereotype.Repository;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.processor.PageProcessor;

import us.codecraft.webmagic.selector.Json;

import us.codecraft.webmagic.selector.JsonPathSelector;

import java.util.*;

/**
 * Crawler for exchange-market quotes served by a JSON endpoint (direct HTTP request example).
 *
 * <p>The response body is expected to contain an {@code rf} array; each element becomes one
 * {@link ExchangeMarket}. The list is stored on the page under the key {@code "exchangeMarket"}.
 */
@Repository
public class ExchangeMarketProcessor implements PageProcessor {

    /** Crawl settings shared by all instances: retry times 3, sleep time 100. */
    private static final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * @return the crawl settings used for this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Converts every element of the {@code rf} array in the response into an ExchangeMarket.
     *
     * @param page the downloaded response; receives the list under {@code "exchangeMarket"}
     */
    @Override
    public void process(Page page) {
        String updateTime = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);

        String json = page.getJson().get();
        System.out.println("==" + json);

        // Each selected string is the JSON text of one element of the "rf" array.
        List<String> entries = new JsonPathSelector("$.rf").selectList(page.getRawText());
        List<ExchangeMarket> quotes = new ArrayList<>(entries.size());

        for (String entry : entries) {
            Json item = new Json(entry);
            ExchangeMarket quote = new ExchangeMarket();
            quote.setItemName(item.jsonPath("$.proName").get());
            quote.setRisefall(item.jsonPath("$.riseSign").get());
            quote.setBankBuyingPrice(item.jsonPath("$.buyRate").get());
            quote.setBankSellingPrice(item.jsonPath("$.sellRate").get());
            quote.setMiddlePrice(item.jsonPath("$.middPrice").get());
            quote.setDayRisefallRange(item.jsonPath("$.openprice_dr").get());
            quote.setDayRisefallValue(item.jsonPath("$.openprice_dv").get());
            quote.setYearRisefallRange(item.jsonPath("$.openprice_yr").get());
            quote.setUpdateTime(updateTime);
            quotes.add(quote);
        }

        page.putField("exchangeMarket", quotes);
    }

    /**
     * Manual test entry point: sends the POST request and prints the extracted fields.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String agriculturalUrl = "https://mybank.icbc.com.cn/ctp/ctpservlet/EbdpAjaxServlet";

        // Build the POST request and its form parameters.
        Request exchangeMarketRequest = new Request(agriculturalUrl);
        exchangeMarketRequest.setMethod(HttpConstant.Method.POST);

        Map<String, Object> agriculturalMap = new HashMap<>();
        agriculturalMap.put("tranCode", "A00513");
        exchangeMarketRequest.setRequestBody(HttpRequestBody.form(agriculturalMap, "utf-8"));

        Spider.create(new ExchangeMarketProcessor())
                .addPipeline(new ConsolePipeline())
                .addRequest(exchangeMarketRequest)
                .thread(1)
                .run();
    }
}

该例中实体类如下:

/** Entity for one exchange-market quote, as filled in by ExchangeMarketProcessor. */
@Data

public class ExchangeMarket {

/** Primary key. */

private Long id;

/**

* Product / instrument name.

*/

private String itemName;

/**

* Rise or fall indicator.

*/

private String risefall;

/**

* Bank buying price.

*/

private String bankBuyingPrice;

/**

* Bank selling price.

*/

private String bankSellingPrice;

/**

* Middle price.

*/

private String middlePrice;

/**

* Change value for the current day.

*/

private String dayRisefallValue;

/**

* Change percentage for the current day.

*/

private String dayRisefallRange;

/**

* Change percentage for the current year.

*/

private String yearRisefallRange;

/**

* Update time.

*/

private String updateTime;

}

该例的请求url【见上文代码】: https://mybank.icbc.com.cn/ctp/ctpservlet/EbdpAjaxServlet

对于js请求的接口,可以先用Postman调用一下,确认请求方式及所需参数等内容。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值