最近一直想做自己的淘客网站,想到用该工具抓数据,但是使用中发现抓取失败率太高了,因此打算换成抓包分析接口获取。此贴仅记录学习过程:项目基于 Spring Boot,只提供关键代码,其他配置都比较简单,这里不再给出。如果有人知道为什么失败率那么高也请告诉我(页面都是能访问的,但是跑的时候会有很多"无法下载"的提示)。其中用到了 JPA 保存结果数据,但由于 WebMagic 是用 new 的方式启动的,Spring 无法注入数据库连接,保存那部分代码可以忽略。
import com.qbin.crawlers.common.globalconst.RuleConst;
import com.qbin.crawlers.common.util.CrawlerUtil;
import com.qbin.crawlers.crawler.model.Goods;
import com.qbin.crawlers.crawler.repository.CrawlerGoodsRepository;

import java.util.concurrent.atomic.AtomicInteger;

import org.springframework.beans.factory.annotation.Autowired;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* 描述:淘宝客数据爬取
* author qiaobin 2016/9/29 17:34.
*/
/**
 * WebMagic {@link PageProcessor} that crawls Taobao-affiliate goods data from
 * taokezhushou.com: category main pages and pagination pages contribute links,
 * goods detail pages contribute the actual data.
 *
 * <p>NOTE(review): this class is instantiated with {@code new} in {@link #main},
 * so Spring never injects {@code goodsRepository} — it stays {@code null}.
 * Saving is skipped in that case instead of throwing an NPE on every detail page.
 */
public class TaoBaoZhuShouProcessor implements PageProcessor {

    @Autowired
    private CrawlerGoodsRepository goodsRepository;

    // Retry failed downloads up to 10 times; wait 1s between requests.
    private Site site = Site.me().setRetryTimes(10).setSleepTime(1000);

    // Number of detail pages scraped. AtomicInteger because the Spider runs
    // with 5 worker threads and a plain "count++" would race.
    private static final AtomicInteger count = new AtomicInteger();
    // Goods category of the main page currently being crawled; volatile so the
    // value written by one worker thread is visible to the others.
    private static volatile int type = 0;

    @Override
    public void process(Page page) {
        // Three kinds of pages:
        //   - category main page: collect goods links and pagination links
        //   - pagination page:    collect goods links
        //   - goods detail page:  extract the data
        if (CrawlerUtil.isMainPage(page)) {
            // 1. Determine the goods category from the URL.
            type = CrawlerUtil.goodsType(page);
            // 2. Queue all goods detail links found on this page.
            page.addTargetRequests(page.getHtml().xpath(RuleConst.GOODSLISTRULE).links().all());
            // 3. Derive every pagination URL from the last-page link and queue them.
            String lastPageUrl = CrawlerUtil.getLastPageUrl(page, RuleConst.LASTPAGEURL);
            int lastPageNo = CrawlerUtil.getLastPageNo(lastPageUrl);
            page.addTargetRequests(CrawlerUtil.getPageUrls(lastPageNo, lastPageUrl));
        } else if (page.getUrl().regex(RuleConst.PAGEREGEX).match()) {
            // Pagination page: queue the goods detail links only.
            page.addTargetRequests(page.getHtml().xpath(RuleConst.GOODSLISTRULE).links().all());
        } else {
            // Goods detail page: extract all fields via the XPath rules.
            count.incrementAndGet();
            String title = CrawlerUtil.getValue(page, RuleConst.TITLE);                    // title
            String picture = CrawlerUtil.getValue(page, RuleConst.PICTRURE);               // picture URL
            String describe = CrawlerUtil.getValue(page, RuleConst.DESCRIBE);              // description
            String quanhoujia = CrawlerUtil.getValue(page, RuleConst.QUANHOUJIA);          // price after coupon
            String zaishoujia = CrawlerUtil.getValue(page, RuleConst.ZAISHOUJIA);          // list price
            String coupon = CrawlerUtil.getValue(page, RuleConst.YOUHUIQUAN);              // coupon amount
            String couponRemark = CrawlerUtil.getValue(page, RuleConst.YOUHUIQUANBEIZHU);  // coupon note
            String commission = CrawlerUtil.getValue(page, RuleConst.YONGJIN);             // commission
            String wenan = CrawlerUtil.getValue(page, RuleConst.WENAN);                    // marketing copy
            String pcyouhuihref = CrawlerUtil.getValue(page, RuleConst.PCYOUHUIHREF);      // PC coupon link
            String phoneyouhuihref = CrawlerUtil.getValue(page, RuleConst.PHONEYOUHUIHREF);// mobile coupon link
            String goodshref = CrawlerUtil.getValue(page, RuleConst.GOODSHREF);            // goods link

            Goods goods = new Goods();
            goods.setTitle(title);
            goods.setPicture(picture);
            goods.setDescribe(describe);
            goods.setCommission(commission);
            goods.setCoupon(coupon);
            goods.setCouponRemark(couponRemark);
            goods.setWenan(wenan);
            goods.setDiscountedPrice(quanhoujia);
            goods.setPrice(zaishoujia);
            goods.setPcCouponHref(pcyouhuihref);
            goods.setPhoneCouponHref(phoneyouhuihref);
            goods.setGoodsHref(goodshref);
            goods.setGoodsType(type);

            // Repository is null when launched via "new" (no Spring injection) —
            // skip persistence instead of crashing the crawl with an NPE.
            if (goodsRepository != null) {
                goodsRepository.save(goods);
            }
        }
    }

    @Override
    public Site getSite() {
        return this.site;
    }

    public static void main(String[] args) {
        System.out.println("========淘客助手爬虫【启动】喽!=========");
        Spider.create(new TaoBaoZhuShouProcessor())
                .addUrl(RuleConst.NVZHUANG)
                .thread(5)
                .run();
        System.out.println(String.format("共抓取%s条数据", count.get()));
        System.out.println("========淘客助手爬虫【结束】喽!=========");
    }
}
import java.util.ArrayList;
import java.util.List;
/**
* 描述:常量定义
* author qiaobin 2016/10/10 15:03.
*/
/**
 * Crawl-rule constants for taokezhushou.com: category entry URLs, the XPath
 * extraction rules, and the pagination URL regex.
 */
public final class RuleConst {

    private RuleConst() {
        // Constants holder; never instantiated.
    }

    /** Site home page. */
    public final static String TAOKEZHUSHOU = "http://www.taokezhushou.com/";
    /** Category 1: women's clothing (女装). */
    public final static String NVZHUANG = "http://www.taokezhushou.com/cate/1";
    /** Category 2: men's clothing (男装). */
    public final static String NANZHUANG = "http://www.taokezhushou.com/cate/2";
    /** Category 3: shoes and bags (鞋包). */
    public final static String XIEBAO = "http://www.taokezhushou.com/cate/3";
    /** Category 5: home goods (家居). */
    public final static String JIAJU = "http://www.taokezhushou.com/cate/5";
    /** Category 6: culture and sports (文体). */
    public final static String WENTI = "http://www.taokezhushou.com/cate/6";
    /** Category 7: appliances (电器). */
    public final static String DIANQI = "http://www.taokezhushou.com/cate/7";
    /** Category 8: accessories (配饰). */
    public final static String PEISHI = "http://www.taokezhushou.com/cate/8";
    /** Category 9: digital products (数码). */
    public final static String SHUMA = "http://www.taokezhushou.com/cate/9";
    /** Category 10: food (美食). */
    public final static String MEISHI = "http://www.taokezhushou.com/cate/10";
    /** Category 11: cosmetics (美妆). */
    public final static String MEIZHUANG = "http://www.taokezhushou.com/cate/11";
    /** Category 12: mother and baby (母婴). */
    public final static String MUYING = "http://www.taokezhushou.com/cate/12";
    /** Category 13: other (其他). */
    public final static String QITA = "http://www.taokezhushou.com/cate/13";

    /** All category main pages; used by CrawlerUtil.isMainPage. */
    public final static List<String> MAINPAGELIST = new ArrayList<>();
    static {
        MAINPAGELIST.add(NVZHUANG);
        MAINPAGELIST.add(NANZHUANG);
        MAINPAGELIST.add(XIEBAO);
        MAINPAGELIST.add(JIAJU);
        MAINPAGELIST.add(WENTI);
        MAINPAGELIST.add(DIANQI);
        MAINPAGELIST.add(PEISHI);
        MAINPAGELIST.add(SHUMA);
        MAINPAGELIST.add(MEISHI);
        MAINPAGELIST.add(MEIZHUANG);
        MAINPAGELIST.add(MUYING);
        MAINPAGELIST.add(QITA);
    }

    /** Featured goods on the main page (4 items). */
    public final static String TOP4RULE = "//div[@class='goods clearfix']/div/ul[@class='mg clearfix']/li/div/a";
    /** Regular goods listing (100 items). */
    public final static String GOODSLISTRULE = "//div[@class='goods']/div/ul/li/div/a";
    /** Pagination links; the last-page link is the second-to-last element. */
    public final static String LASTPAGEURL = "//div[@class='pages wth']/ul/li/a";
    /** Pagination URL pattern, e.g. http://www.taokezhushou.com/cate/1?page=2#new
     *  (was "[\\d\\d]+", a redundant character class equivalent to "\\d+"). */
    public final static String PAGEREGEX = "http://www.taokezhushou.com/cate/\\d+\\?page=\\d+#new";

    // #################### goods detail page ########################
    /** Goods picture URL. NOTE(review): name keeps the original "PICTRURE" typo — external callers reference it. */
    public final static String PICTRURE = "//div[@class='goods-img fl']/a/img/@src";
    /** Title. */
    public final static String TITLE = "//div[@class='goods-intro fr']/div/div/h3/text()";
    /** Goods description. */
    public final static String DESCRIBE = "//div[@class='goods-intro fr']/div[@class='intro']/p/text()";
    /** Price after coupon (券后价). */
    public final static String QUANHOUJIA = "//div[@class='intro1']/ul/li[@class='tro1 fl']/span/text()";
    /** List price (在售价). */
    public final static String ZAISHOUJIA = "//div[@class='intro1']/ul/li[@class='tro2 fl']/text()";
    /** Coupon amount (优惠券). */
    public final static String YOUHUIQUAN = "//div[@class='intro2 clearfix']/p[@class='int1 fl']/span/text()";
    /** Coupon note (优惠券备注). */
    public final static String YOUHUIQUANBEIZHU = "//div[@class='intro2 clearfix']/p[@class='int2 fl']/text()";
    /** Commission (佣金). */
    public final static String YONGJIN = "//div[@class='intro4-left fl']/ul/li[@class='intr1']/span/text()";
    /** PC coupon-claim link. */
    public final static String PCYOUHUIHREF = "//div[@class='intro4-left fl']/p[1]/a[1]/@href";
    /** Mobile coupon-claim link. */
    public final static String PHONEYOUHUIHREF = "//div[@class='intro4-left fl']/p[1]/a[2]/@href";
    /** Goods link. */
    public final static String GOODSHREF = "//div[@class='intro4-left fl']/p[2]/a/text()";
    /** Marketing copy (文案). */
    public final static String WENAN = "//div[@id='wenan']";
}
import com.qbin.crawlers.common.globalconst.RuleConst;
import us.codecraft.webmagic.Page;
import java.util.ArrayList;
import java.util.List;
/**
* 描述:爬虫工具类
* author qiaobin 2016/10/10 15:37.
*/
/**
 * Static helpers shared by the crawler: XPath extraction, page-type detection,
 * and pagination URL expansion.
 */
public final class CrawlerUtil {

    private CrawlerUtil() {
        // Utility class; never instantiated.
    }

    /**
     * Returns all links matched by the given XPath on the page.
     */
    public final static List<String> getLinksList(Page page, String xpath) {
        return page.getHtml().xpath(xpath).links().all();
    }

    /**
     * Returns the first value matched by the given XPath on the page.
     */
    public final static String getValue(Page page, String xpath) {
        return page.getHtml().xpath(xpath).get();
    }

    /**
     * Whether the page is one of the category main pages (see RuleConst.MAINPAGELIST).
     */
    public static boolean isMainPage(Page page) {
        return RuleConst.MAINPAGELIST.contains(page.getUrl().toString());
    }

    /**
     * Maps a category main-page URL to its numeric goods type (the "/cate/N" id).
     * Returns 0 for unrecognized URLs.
     */
    public static int goodsType(Page page) {
        switch (page.getUrl().toString()) {
            case RuleConst.NVZHUANG:  return 1;
            case RuleConst.NANZHUANG: return 2;
            case RuleConst.XIEBAO:    return 3;
            case RuleConst.JIAJU:     return 5;
            case RuleConst.WENTI:     return 6;
            case RuleConst.DIANQI:    return 7;
            case RuleConst.PEISHI:    return 8;
            case RuleConst.SHUMA:     return 9;
            case RuleConst.MEISHI:    return 10;
            case RuleConst.MEIZHUANG: return 11;
            case RuleConst.MUYING:    return 12;
            case RuleConst.QITA:      return 13; // bug fix: "other" category was missing and fell through to 0
            default:                  return 0;
        }
    }

    /**
     * Builds the full list of pagination URLs (pages 1..lastPageNo) by swapping
     * the page number into the last-page URL.
     *
     * @param lastPageNo  highest page number, inclusive
     * @param lastPageUrl a URL of the form "...page=N#new"
     */
    public static List<String> getPageUrls(int lastPageNo, String lastPageUrl) {
        // Everything up to and including "page=".
        String head = lastPageUrl.substring(0, lastPageUrl.indexOf("page=") + 5);
        // The trailing "#new" fragment.
        String tail = lastPageUrl.substring(lastPageUrl.lastIndexOf("#new"));
        List<String> list = new ArrayList<>(lastPageNo);
        for (int i = 1; i <= lastPageNo; i++) {
            list.add(head + i + tail);
        }
        return list;
    }

    /**
     * Returns the last-page link from a category page's pagination bar.
     * The second-to-last anchor is the highest page number (the last one is
     * "next"). Assumes the page has at least two pagination links — TODO confirm.
     */
    public static String getLastPageUrl(Page page, String regex) {
        List<String> urls = CrawlerUtil.getLinksList(page, regex);
        return urls.get(urls.size() - 2);
    }

    /**
     * Extracts the numeric page number from a URL of the form "...page=N#new".
     */
    public static int getLastPageNo(String lastPageUrl) {
        String pageNo = lastPageUrl.substring(
                lastPageUrl.lastIndexOf("page=") + 5, lastPageUrl.lastIndexOf("#new"));
        return Integer.parseInt(pageNo);
    }
}