webmagic 练习之保存使用上一级信息

最新推荐文章于 2022-02-09 20:02:53 发布

weixin_34112900

最新推荐文章于 2022-02-09 20:02:53 发布

阅读量105

点赞数

文章标签： python 数据库 java

原文链接：https://my.oschina.net/jianqiangxing/blog/213774

版权

2019独角兽企业重金招聘Python工程师标准>>>

前几天看了黄亿华先生的文章《玩转webmagic代码之Scheduler》，感觉很有用，因为我们在抓取信息时往往需要保存上一级页面的信息。比如我最近的毕业设计要去爬一些导航网站，爬首页时要保存下来大类（视频），然后进入视频的子页面抓取子类（电影，电视剧，动漫……），最终把它们格式化为（url，大类，子类）存到数据库，以便动态生成页面时取数据。

想要拿他文章中的例子试试手，结果发现好像有点错误，就自己捣鼓了一下，最终还是有点小问题（直辖市的没法儿搞，页面不统一，写了处理方法却不知为何不奏效遂放弃）

下边是代码：

import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.PriorityScheduler;

import java.util.List;

import static us.codecraft.webmagic.selector.Selectors.regex;
import static us.codecraft.webmagic.selector.Selectors.xpath;

/**
 * @author code4crafter@gmail.com
 */
public class answerSpider implements PageProcessor {

    private Site site = Site.me().setCharset("gb2312")
            .setSleepTime(100).addStartUrl("http://www.ip138.com/post/");

    @Override
    public void process(Page page) {
    	//这还真是深度优先啊
        if (page.getUrl().toString().equals("http://www.ip138.com/post/")) {
        	System.out.println("进入第一层网页\n\n");
            processCountry(page);
        } else if (page.getUrl().regex("http://www\\.ip138\\.com/\\w+[/]?$").toString() != null) { //都是绝对路径
        	System.out.println("进入第二层网页\n\n");
            processProvince(page);
        } else {
        	System.out.println("进入第三层网页\n\n");
            processDistrict(page);
        }

    }

    private void processCountry(Page page) {
        List<String> provinces = page.getHtml().xpath("//*[@id=\"newAlexa\"]/table/tbody/tr/td").all();
        for (String province : provinces) {
            String link = xpath("//@href").select(province);
            String title = xpath("/text()").select(province);
            if(link==null)
            	 continue;
            //System.out.println(title+": "+link+"\n\n");
            Request request = new Request(link).setPriority(0).putExtra("province", title);
            page.addTargetRequest(request);
        }
    }

    private void processProvince(Page page) {
        //这里仅靠xpath没法精准定位，所以使用正则作为筛选，不符合正则的会被过滤掉
        List<String> districts = page.getHtml().xpath("//body/table[@class='t12']/tbody/tr/td").regex(".*http://www\\.ip138\\.com/\\w+/\\w+.*").all();
         for (String district : districts) {
              String link = xpath("//@href").select(district);
              String title = xpath("/text()").select(district);
              // System.out.println(title+": "+link+"\n\n");
              Request request = new Request(link).setPriority(1).putExtra("province", page.getRequest().getExtra("province")).putExtra("district", title);
              page.addTargetRequest(request);
          }
      
    }

    private void processDistrict(Page page) { 
        
    	String province = page.getRequest().getExtra("province").toString();
        String district = page.getRequest().getExtra("district").toString();
        
        String re0="<td>.*([\\u4e00-\\u9fa5]+).*</td>"; //匹配城市   海南省的比较奇怪
        String re1="<td><a href=\"http://www\\.ip138\\.com/[0-9]{6}/\">([0-9]{6})</a></td>";//匹配邮编
        String re2="<td><a href=\"http://www\\.ip138\\.com/[0-9]{4}/\">([0-9]{4})</a></td>";//匹配区号
        List<String> counties = page.getHtml().xpath("//body/table[@class='t12']/tbody/tr").regex(re0).all();
        List<String> youbians = page.getHtml().xpath("//body/table[@class='t12']/tbody/tr").regex(re1).all();
        List<String> quhaos = page.getHtml().xpath("//body/table[@class='t12']/tbody/tr").regex(re2).all();
        
        int i=counties.size();
        for(int j=0;j<i;j++){
           System.out.println(province+" "+district+" "+counties.get(j)+" 邮编："+youbians.get(j)+" 区号："+quhaos.get(j));
        }
       
        /*
        int i=counties.size();
        for(int j=0;j<i;j++){
            page.putField("province", province);
            page.putField("district",district);
            page.putField("counties",counties.get(j));
            page.putField("youbians",youbians.get(j));
            page.putField("quhaos",quhaos.get(j));
          */
        }
        
    }
    

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
    	FilePipeline fp=new FilePipeline("c:\\data.txt");
        Spider.create(new answerSpider()).scheduler(new PriorityScheduler()).pipeline(fp).run();

    }
}

贴出来一是以后自己看，二是让各位大牛帮忙指导一下

另外貌似保存到文件时会出现一点小问题，而且是每个页面一个文件，感觉不太好，所以就自己把结果打印到控制台了

转载于:https://my.oschina.net/jianqiangxing/blog/213774