爬虫学习笔记-WebMagic初识

WebMagic是一款基于Java的开源爬虫框架,简单灵活,使用起来非常方便,与Scrapy爬虫框架很相似。WebMagic的原理与使用方法官方文档有详细的说明,这里就不再赘述,直接上实例。

这个WebMagic爬虫实例是结合Spring框架实现的,采用的是基于Redis的调度器,并对爬虫的过程进行了简单的监听。

WebMagic使用说明链接地址: http://webmagic.io/docs/zh/

GitHub项目链接地址: https://github.com/code4craft/webmagic


		
<!-- WebMagic依赖包 -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.5.3</version>
</dependency>

<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.5.3</version>
</dependency>

<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-selenium</artifactId>
    <version>0.5.2</version>
</dependency>

import java.util.ArrayList;
import java.util.List;

import javax.annotation.Resource;
import javax.management.JMException;

import org.platform.crawler.webmagic.modules.abstr.GenericCrawler;
import org.platform.crawler.webmagic.modules.job.pipeline.JobDBPipeline;
import org.platform.crawler.webmagic.scheduler.RedisScheduler;
import org.springframework.stereotype.Component;

import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.SpiderListener;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.monitor.SpiderMonitor.MonitorSpiderListener;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;

@Component("jobCrawler")
public class JobCrawler extends GenericCrawler {
	
	/** Persists scraped Job entities; injected by Spring. */
	@Resource(name = "jobDBPipeline")
	private JobDBPipeline jobDBPipeline = null;
	
	/** Redis-backed scheduler so the crawl can be distributed/resumed; injected by Spring. */
	@Resource(name = "redisScheduler")
	private RedisScheduler redisScheduler = null;
	
	/**
	 * Builds one seed URL per target city, wires the pipelines and the Redis
	 * scheduler, registers a JMX monitor, then runs the spider. Blocks until
	 * the crawl finishes and prints the monitor's success/error statistics.
	 */
	public void startCrawl() {
		// 51job search URL template; $jobarea$/$keyword$/$curr_page$ are filled in below.
		// FIX: the original literals contained "cotype=99°reefrom=99" — the "&deg"
		// of "&degreefrom" had been HTML-entity-decoded to "°" somewhere along the
		// way, corrupting the query string. Restored to "&degreefrom=99".
		String url_template = "http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=$jobarea$%2C00&district=000000&funtype=0000&industrytype=00&issuedate=9&providesalary=99&keyword=$keyword$&keywordtype=2&curr_page=$curr_page$&lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&list_type=0&fromType=14&dibiaoid=0&confirmdate=9";
		// Area codes: Beijing, Shanghai, Guangzhou, Shenzhen, Hangzhou, Chengdu.
		String[] jobareas = { "010000", "020000", "030200", "040000", "080200", "090200" };
		String keyword = "%E5%B7%A5%E7%A8%8B%E5%B8%88"; // URL-encoded "工程师" (engineer)
		List<String> initialize_urls = new ArrayList<String>();
		for (String jobarea : jobareas) {
			initialize_urls.add(url_template
					.replace("$jobarea$", jobarea)
					.replace("$keyword$", keyword)
					.replace("$curr_page$", "1"));
		}
		List<Pipeline> pipelines = new ArrayList<Pipeline>();
		pipelines.add(new ConsolePipeline()); // echo extracted items for debugging
		pipelines.add(jobDBPipeline);         // persist extracted items to the DB
		Spider jobSpider = Spider.create(new JobPageProcessor())
                .addUrl(initialize_urls.toArray(new String[0]))
                .setScheduler(redisScheduler)
                .setPipelines(pipelines)
                .thread(4);
		try {
			// Monitoring is optional; a JMX registration failure must not abort the crawl.
			SpiderMonitor.instance().register(jobSpider);
		} catch (JMException e) {
			e.printStackTrace();
		}
		jobSpider.run(); // blocks until the spider finishes
		// Dump the statistics gathered by the monitor's listener.
		List<SpiderListener> spiderListeners = jobSpider.getSpiderListeners();
		for (SpiderListener spiderListener : spiderListeners) {
			if (spiderListener instanceof MonitorSpiderListener) {
				MonitorSpiderListener monitorSpiderListener = (MonitorSpiderListener) spiderListener;
				System.out.println("success count: " + monitorSpiderListener.getSuccessCount());
				System.out.println("error count: " + monitorSpiderListener.getErrorCount());
				System.out.println("error urls: ");
				for (String errorUrl : monitorSpiderListener.getErrorUrls()) {
					System.out.println(errorUrl);
				}
			}
		}
	}

	public static void main(String[] args) {
		// NOTE(review): instantiating with `new` bypasses the Spring container, so the
		// @Resource fields (jobDBPipeline, redisScheduler) remain null; run() is
		// presumably inherited from GenericCrawler and delegates to startCrawl() —
		// confirm, or fetch the "jobCrawler" bean from the ApplicationContext instead.
		new JobCrawler().run();
	}
	
}
可以通过 setDownloader(new SeleniumDownloader("chromedriver.exe")) 设置下载器,模拟真实浏览器来渲染页面。
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.platform.crawler.webmagic.modules.job.entity.Job;
import org.springframework.stereotype.Component;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

@Component
public class JobPageProcessor implements PageProcessor {

    /**
     * Matches the pagination query parameter; capture group 1 is the page number.
     * Compiled once instead of on every page (the original recompiled it per call).
     */
    private static final Pattern CURR_PAGE_PATTERN = Pattern.compile("curr_page=(\\d+)");

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10 * 1000);
    
    /**
     * Extracts Job records from a 51job search-result page, publishes them under
     * the "jobs" result key, and — only while the page still yields results —
     * enqueues the next result page. The original enqueued the next page
     * unconditionally, so the crawl never terminated.
     */
    @Override
    public void process(Page page) {
    	List<String> divs = page.getHtml().xpath("//*[@id='resultList']/div[@class='el']").all();
    	List<Job> jobs = new ArrayList<Job>();
    	// Start at i = 1: the first matched div is presumably the header row — TODO confirm.
    	for (int i = 1, len = divs.size(); i < len; i++) {
    		Document document = Jsoup.parse(divs.get(i));
    		Elements names = document.select("p.t1 a");       // job title + detail link
    		Elements companies = document.select("span.t2 a"); // company name
    		Elements places = document.select("span.t3");      // workplace
    		Elements salaries = document.select("span.t4");    // salary
    		Elements dates = document.select("span.t5");       // publish date
    		for (int e = 0, elen = names.size(); e < elen; e++) {
    			Job job = new Job();
    			job.setJobName(names.get(e).text());
    			job.setJobUrl(names.get(e).attr("href"));
    			job.setCompanyName(companies.get(e).text());
    			job.setWorkplace(places.get(e).text());
    			job.setSalary(salaries.get(e).text());
    			job.setPublishDate(dates.get(e).text());
    			jobs.add(job);
    		}
    	}
    	page.putField("jobs", jobs);
    	if (jobs.isEmpty()) {
    		return; // empty result page: stop paginating this search
    	}
    	String current_url = page.getRequest().getUrl();
    	Matcher matcher = CURR_PAGE_PATTERN.matcher(current_url);
    	if (matcher.find()) {
    		// Group 1 is guaranteed to be digits by the pattern, so parseInt cannot throw.
    		int next_page = Integer.parseInt(matcher.group(1)) + 1;
    		page.addTargetRequest(current_url.replace(matcher.group(), "curr_page=" + next_page));
    	}
    }

	@Override
	public Site getSite() {
		return site;
	}
	
}
Site 中可以添加 Header、Cookie、UserAgent、HttpProxy、Domain 等信息。
import java.util.List;

import javax.annotation.Resource;

import org.platform.crawler.webmagic.modules.abstr.mapper.GenericMapper;
import org.platform.crawler.webmagic.modules.abstr.pipeline.DBPipeline;
import org.platform.crawler.webmagic.modules.job.entity.Job;
import org.platform.crawler.webmagic.modules.job.mapper.JobMapper;
import org.springframework.stereotype.Component;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;

@Component("jobDBPipeline")
public class JobDBPipeline extends DBPipeline<Job, Long> {

	/** MyBatis-style mapper used to persist Job rows; injected by Spring. */
	@Resource(name = "jobMapper")
	private JobMapper jobMapper = null;
	
	@Override
	public GenericMapper<Job, Long> obtainMapperInstance() {
		return jobMapper;
	}
	
	/**
	 * Inserts every Job the page processor stored under the "jobs" result key.
	 * Pages that produced no "jobs" field are skipped instead of throwing an
	 * NPE (the original dereferenced the list without a null check).
	 */
	@Override
	public void process(ResultItems resultItems, Task task) {
		List<Job> jobs = resultItems.get("jobs");
		if (jobs == null) {
			return; // nothing extracted from this page
		}
		for (Job job : jobs) {
			jobMapper.insert(job);
		}
	}

}





  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值