webmagic实例

本实例用于拉取应用包信息;功能比较单一,大家别介意。

package com.energy.yyb;


import java.util.ArrayList;
import java.util.List;
import java.util.Map;


import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;


@SuppressWarnings("unused")
public class TestMain {
public static void main(String[] args) {
DAO dao = new DAO();
List<TApp> applist = dao.getForList(TApp.class, "select * from t_app", new Object[]{});
for(TApp app:applist){
System.out.println(app.getPakeage());
Spider spider = Spider.create(new PageCategory(app.getPakeage(),dao))
         .addUrl("http://sj.qq.com/myapp/detail.htm?apkName="+app.getPakeage())
         .setScheduler(new  FileCacheQueueScheduler("D:\\appDataGet"));
spider.thread(10);
spider.run();
}



}
}

package com.energy.yyb;


import java.util.List;




import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * 鑾峰彇绫诲埆
 * @author chungong.zhang
 *
 */
public class PageCategory  implements PageProcessor{
private Site site = Site.me().setRetryTimes(3);
public String pk = "";
public DAO dao ;
@Override
public Site getSite() {
return site;
}

public PageCategory(String pk,DAO dao){
this.pk=pk;
this.dao=dao;
}


@Override
public void process(Page page) {//
/*    page.addTargetRequests(page.getHtml().links().regex("^(http://sj.qq.com/myapp/detail.htm(\\?|&+)(.+?)=([^&]*))$").all());
   page.addTargetRequests(page.getHtml().links().regex("^(http://sj.qq.com/myapp/category.htm?orgame=1&categoryId=(\\?|&+)(.+?)=([^&]*))$").all());*/
   String[] ii = page.getUrl().toString().split("apkName=");
 /*  if(null != ii &&ii.length>1){
    this.pk=ii[1];
   }*/
   List<String> iconList = page.getHtml().xpath("//div[@class='det-icon']/img/@src").all();
List<String> namelist = page.getHtml().xpath("//div[@class='det-name-int']/text()").all();
List<String> info = page.getHtml().xpath("//div[@class='det-app-data-info']/text()").all();

List<String> sizeInfo = page.getHtml().xpath("//div[@class='det-size']/text()").all();

List<String> typeInfo = page.getHtml().xpath("//a[@class='det-type-link']/text()").all();


List<String> pfInfo = page.getHtml().xpath("//div[@class='com-blue-star-num']/text()").all();

List<String> downum = page.getHtml().xpath("//div[@class='det-ins-num']/text()").all();
System.out.println("下载数:"+downum);
//det-comment-num
List<String> appUrlList = page.getHtml().xpath("//a[@class='det-down-btn']/@data-apkurl").all();
List<String> bimgList = page.getHtml().xpath("//div[@class='pic-img-box']/img/@data-src").all();
List<String> developeList = page.getHtml().xpath("//div[@class='det-othinfo-data']/text()").all();
StringBuffer insertsql = new StringBuffer("INSERT INTO t_app_2 ( name, pakeage, embeded_pakeage, main_class, embeded_main_class, "
+ "develope, heat, subject_id, category_id, cuptime, uptime, ctime, version_code, version, "
+ "embeded_version, up_desc, app_desc, app_size, embeded_app_size, icon, app, embeded_app, "
+ "notify, notify_status, share_content, main_pic, main_pic2, down_cnt, pic1, pic2, pic3, pic4,"
+ " pic5, app_source, app_status, source_url, support, os_version_min, app_tag, single_word, age_limit, "
+ "down_num, md5_code, landscapemode, video) VALUES (");
if(null != namelist && namelist.size()>0){
insertsql.append("'"+namelist.get(0).replace("'", "")+"',");//name
}else{
page.setSkip(true);
return ;
}
insertsql.append("'"+pk+"',");//pakeage
List<TApp> applist = dao.getForList(TApp.class, "select * from t_app_2 where pakeage=?", new Object[]{pk});
if(null != applist &&applist.size()>0){
page.setSkip(true);
return ;
}
insertsql.append("'',");//embeded_pakeage
insertsql.append("'',");//embeded_main_class
insertsql.append("'',");
if(null != developeList&&developeList.size()>3){
insertsql.append("'"+developeList.get(2).replace("'", "")+"',");//develope
}else{
insertsql.append("'',");//develope
}
insertsql.append("'4',");//heat
insertsql.append("'0',");//subject_id
insertsql.append("'0',");//category_id
insertsql.append("now(),");//cuptime
insertsql.append("now(),");//uptime
insertsql.append("now(),");//ctime
insertsql.append("'0',");//version_code
if(null != developeList&&developeList.size()>0){
insertsql.append("'"+developeList.get(0).replace("'", "")+"',");//version
}else{
insertsql.append("'',");//version
}
insertsql.append("'',");//embeded_version
if(null != info&&info.size()>0){
insertsql.append("'"+info.get(0).replace("'", "")+"',");//app_desc
}else{
insertsql.append("'',");//app_desc
}
if(null != info&&info.size()>0){
insertsql.append("'"+info.get(0).replace("'", "")+"',");//app_desc
}else{
insertsql.append("'',");//app_desc
}
if(null != sizeInfo&&sizeInfo.size()>0){
insertsql.append("'"+getAppSize(sizeInfo.get(0)+"B")+"',");//app_size
}else{
insertsql.append("'0',");//app_desc
}
insertsql.append("'0',");//embeded_app_size
if(null != iconList && iconList.size()>0){
insertsql.append("'"+iconList.get(0).replace("'", "")+"',");//icon 
}else{
insertsql.append("'',");//icon 
}
if(null != appUrlList&&appUrlList.size()>0){
insertsql.append("'"+appUrlList.get(0).replace("'", "")+"',");//app
}else{
insertsql.append("'',");//app
}
insertsql.append("'0',");//embeded_app_size
insertsql.append("'0',");//notify
insertsql.append("'0',");//notify_status
insertsql.append("'"+downum+"',");//share_content
insertsql.append("'',");//main_pic
insertsql.append("'',");//main_pic2
insertsql.append("'0',");//down_cnt
if(null != bimgList&&bimgList.size()>0){
if(null != bimgList.get(0)&&!"".equals(bimgList.get(0))&&!"null".equals(bimgList.get(0))){
insertsql.append("'"+bimgList.get(0).replace("'", "")+"',");//pic1
}else{
insertsql.append("'',");//pic1
}
}else{
insertsql.append("'',");//pic1
}
if(null != bimgList&&bimgList.size()>1){
if(null != bimgList.get(1)&&!"".equals(bimgList.get(1))&&!"null".equals(bimgList.get(1))){
insertsql.append("'"+bimgList.get(1).replace("'", "")+"',");//pic1
}else{
insertsql.append("'',");//pic1
}
}else{
insertsql.append("'',");//pic1
}
if(null != bimgList&&bimgList.size()>2){
if(null != bimgList.get(2)&&!"".equals(bimgList.get(2))&&!"null".equals(bimgList.get(2))){
insertsql.append("'"+bimgList.get(2).replace("'", "")+"',");//pic1
}else{
insertsql.append("'',");//pic1
}
}else{
insertsql.append("'',");//pic1
}
if(null != bimgList&&bimgList.size()>3){
if(null != bimgList.get(3)&&!"".equals(bimgList.get(3))&&!"null".equals(bimgList.get(3))){
insertsql.append("'"+bimgList.get(3).replace("'", "")+"',");//pic1
}else{
insertsql.append("'',");//pic1
}
}else{
insertsql.append("'',");//pic1
}
if(null != bimgList&&bimgList.size()>4){
if(null != bimgList.get(4)&&!"".equals(bimgList.get(4))&&!"null".equals(bimgList.get(4))){
insertsql.append("'"+bimgList.get(4).replace("'", "")+"',");//pic1
}else{
insertsql.append("'',");//pic1
}
}else{
insertsql.append("'',");//pic1
}
insertsql.append("'0',");//app_source
insertsql.append("'0',");//app_status
insertsql.append("'',");//source_url
insertsql.append("'',");//support
insertsql.append("'',");//os_version_min
if(null != typeInfo&&typeInfo.size()>0){
insertsql.append("'"+typeInfo.get(0).replace("'", "")+"',");//app_tag
}else{
insertsql.append("'',");//app_tag
}
insertsql.append("'',");//single_word
insertsql.append("'18岁',");//age_limit
insertsql.append("'10',");//down_num
insertsql.append("'',");//md5_code
insertsql.append("'1',");//landscapemode
insertsql.append("'');");//video
dao.update(insertsql.toString(),new Object[]{});
TXTParseUtils.method2("D://result.txt", insertsql.toString());

}

private Double getAppSize(String fileLength){
System.out.println(fileLength);
Double size = 0D;
if(fileLength.indexOf("MB")>-1){
String sizeStr = fileLength.replace("MB", "").trim();
size =  Double.parseDouble(sizeStr)*1024*1024; 
}else if(fileLength.indexOf("KB")>-1){
String sizeStr = fileLength.replace("KB", "").trim();
size =  Double.parseDouble(sizeStr)*1024; 
}else if(fileLength.indexOf("GB")>-1){
String sizeStr = fileLength.replace("GB", "").trim();
size =  Double.parseDouble(sizeStr)*1024*1024*1024; 
}
return size;
}



}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值