设计思路
- Maven 引用 org.quartz-scheduler,org.jsoup
- 把待抓取的资源填入数据表中,假设表名为 resource
- 控制资源抓取的任务调度表:job_manager
- 建立独立运行的项目,依托 Maven 独立部署
- 项目启动后,每隔【x 时长】去资源表查找特定状态的资源;如果存在,先把该资源的状态更新为运行中,然后爬取,爬取完成后将该资源标记为结束。
关键代码
/**
 * Start-up hook: after the superclass hook has run, builds a {@code QuartzManagerKit}
 * and calls its {@code initJob()} so the scheduled jobs get registered.
 */
@Override
public void afterJFinalStart() {
    super.afterJFinalStart();
    new QuartzManagerKit().initJob();
}
/**
 * Shutdown hook: after the superclass hook has run, shuts the scheduled-job manager down.
 */
@Override
public void beforeJFinalStop() {
    super.beforeJFinalStop();
    // A fresh QuartzManagerKit is created instead of reusing the one from start-up.
    // NOTE(review): this presumably relies on the scheduler factory handing back the
    // scheduler that is already running - confirm against the Quartz version in use.
    new QuartzManagerKit().shutdown();
}
public class DynamicJob3 implements Job{
private static final ResourceService srv=ResourceService.me;
private static final ArticleService srv_article=ArticleService.me;
/**
 * Runs one crawl cycle: fetches a resource from the resource service and, when one with
 * status 0 is returned, sets its status to 1, crawls it, then sets its status to 2.
 *
 * @param context Quartz execution context (not used)
 * @throws JobExecutionException declared by the {@code Job} contract
 */
public void execute(JobExecutionContext context) throws JobExecutionException {
    SpiderResource pending = srv.getone();
    if (pending == null || pending.getStatus() != 0) {
        // Nothing to crawl on this firing.
        return;
    }
    srv.updatestatus(pending.getId(), 1);
    webCrawler(pending);
    srv.updatestatus(pending.getId(), 2);
}
QuartzManagerKit.java 的代码见文末。
具体代码
/**
*
*/
package cn.jdou.spider;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.Job;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import com.jfinal.kit.JsonKit;
import com.jfinal.kit.PropKit;
import cn.jdou.common.model.SpiderArticle;
import cn.jdou.common.model.SpiderResource;
import cn.jdou.spider.article.ArticleService;
import cn.jdou.spider.resource.ResourceService;
/***类描述:
*@author: raifei
*@date: 日期:2018年8月20日 时间:下午5:14:26
*@version 1.0
*/
public class DynamicJob3 implements Job{
private static final ResourceService srv=ResourceService.me;
private static final ArticleService srv_article=ArticleService.me;
public void execute(JobExecutionContext context) throws JobExecutionException {
SpiderResource sr=srv.getone();
if(sr!=null&&sr.getStatus()==0){
srv.updatestatus(sr.getId(), 1);
webCrawler(sr);
srv.updatestatus(sr.getId(), 2);
}
}
public static void webCrawler(SpiderResource sr) {
try {
//获取所有链接
for(int i=sr.getPagestart();i<=sr.getPageend();i=i+sr.getStepnum()){
System.out.print(i);
String url_=sr.getUrl().replace("{page}", String.valueOf(i));
webCrawler(sr,url_);
}
} catch (Exception e) {
e.printStackTrace();
}
}
public static void webCrawler(SpiderResource sr,String url) throws IOException {
Document document = Jsoup.connect(url)
//需要加上userAgent才能伪装成浏览器而不会被网站屏蔽IP
//(这种做法可能也会被某些网站拉黑IP一段时间,由于不太稳定到底是不是代码的问题,还在测试中...)
.userAgent("User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11")
//加上cookie信息
.cookie("auth", "token")
//设置超时
.timeout(30000)
//用get()方式请求网址,也可以post()方式
.get();
//此处可以文档处理
// document = Jsoup.parse(document.toString());
//获取列表
Elements elements = document.select(sr.getRegions());
//获取列表循环元素
for (Element tr : elements.select(sr.getList())) {
//循环元素二次筛选,筛选到 a标签
Elements tds = tr.select(sr.getAhref());
//获取a标签,跳转抓取详情
String href = tds.attr("abs:href");
//提前捕获标题嘛
if(sr.getIscatchtittle()==1){
String title=tr.select(sr.getTitle()).text();
}
//
//System.out.print(href);
List<String> imglist=new ArrayList<String>();
//查找缩略图,查找图片
if(sr.getIscatchimg()==1){
Elements imgs=tr.select("img");
for (Element element : imgs) {
//获取每个img标签URL "abs:"表示绝对路径
String imgSrc = element.attr("abs:src");
//String affix_name=imgSrc;
String fileExt = imgSrc.substring(imgSrc.lastIndexOf(".") + 1).toLowerCase();
String imgpath=UUID.randomUUID().toString().replaceAll("-", "")+"."+fileExt;
// 打印URL
System.out.println(imgSrc);
imglist.add(imgpath);
//下载图片到本地
downImages(PropKit.get("fileservice"), imgSrc,imgpath);
}
webCrawler(href,sr,imglist);
System.out.print(imgs);
}
}
}
public static void webCrawler(String uri,SpiderResource sr,List<String> imgList) {
try {
//获取整个页面文件
Document document = Jsoup.connect(uri)
//需要加上userAgent才能伪装成浏览器而不会被网站屏蔽IP
//(这种做法可能也会被某些网站拉黑IP一段时间,由于不太稳定到底是不是代码的问题,还在测试中...)
.userAgent("User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11")
//加上cookie信息
.cookie("auth", "token")
//设置超时
.timeout(30000)
//用get()方式请求网址,也可以post()方式
.get();
//此处可以文档处理
// document = Jsoup.parse(document.toString());
//获取文章标题
String source=uri;//文章来源
String title= document.select(sr.getTitle()).text();//文章标题
String author="";//获取来源
if(!"".equals(sr.getAuthor())){
author=document.select(sr.getAuthor()).text();
}
String details="";
//获取详情
Elements elements = document.select(sr.getDetails());
//抓取图片并替换链接
Elements imgs = elements.select("img[src]");
for (Element img : imgs){
String affix_name=img.attr("abs:src");
String fileExt = affix_name.substring(affix_name.lastIndexOf(".") + 1).toLowerCase();
String imgpath=UUID.randomUUID().toString().replaceAll("-", "")+"."+fileExt;
img.attr("src",PropKit.get("fileservice")+"//"+imgpath);
//下载图像
downImages(PropKit.get("fileservice"),affix_name,imgpath);
}
details=elements.html();
SpiderArticle sa=new SpiderArticle().setArticlename(title).setCategoryid(sr.getId()).setSource(source).setArticledetails(details).setLitimg(JsonKit.toJson(imgList));
srv_article.save(sa);
//System.out.print(details);
//获取列表循环元素
} catch (IOException e) {
e.printStackTrace();
}
}
public static void downImages(String filePath, String imgUrl,String newname) {
// 若指定文件夹没有,则先创建
File dir = new File(filePath);
if (!dir.exists()) {
dir.mkdirs();
}
// 截取图片文件名
String fileName =imgUrl.substring(imgUrl.lastIndexOf('/') + 1, imgUrl.length());
try {
// 文件名里面可能有中文或者空格,所以这里要进行处理。但空格又会被URLEncoder转义为加号
String urlTail = URLEncoder.encode(fileName, "UTF-8");
// 因此要将加号转化为UTF-8格式的%20
imgUrl = imgUrl.substring(0, imgUrl.lastIndexOf('/') + 1) + urlTail.replaceAll("\\+", "\\%20");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
// 写出的路径
File file = new File(filePath + File.separator + ("".equals(newname)?fileName:newname));
try {
// 获取图片URL
URL url = new URL(imgUrl);
// 获得连接
URLConnection connection = url.openConnection();
// 设置10秒的相应时间
connection.setConnectTimeout(10 * 1000);
// 获得输入流
InputStream in = connection.getInputStream();
// 获得输出流
BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file));
// 构建缓冲区
byte[] buf = new byte[1024];
int size;
// 写入到文件
while (-1 != (size = in.read(buf))) {
out.write(buf, 0, size);
}
out.close();
in.close();
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
QuartzManagerKit.java
package cn.jdou.common.Kit;
//package top.rushpeak.edu03.admin.util;
import static org.quartz.CronScheduleBuilder.cronSchedule;
import static org.quartz.JobBuilder.newJob;
import static org.quartz.TriggerBuilder.newTrigger;
import java.util.List;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.quartz.Job;
import org.quartz.JobDetail;
import org.quartz.JobKey;
import org.quartz.Scheduler;
import org.quartz.SchedulerException;
import org.quartz.Trigger;
import org.quartz.TriggerKey;
import org.quartz.impl.StdSchedulerFactory;
import com.jfinal.plugin.activerecord.Db;
import com.jfinal.plugin.activerecord.Record;
//import top.rushpeak.edu03.admin.job.DynamicJob;//这个是后面的任务实现类
/**
 * Small facade over a Quartz {@link Scheduler}: loads the enabled rows of the
 * {@code job_manager} table as cron-triggered jobs and offers add / remove / pause / resume /
 * reschedule / start / shutdown operations.
 *
 * <p>All operations are best-effort: failures are logged and not rethrown.
 */
public class QuartzManagerKit {
    private static final Logger log = LogManager.getLogger(QuartzManagerKit.class);
    private Scheduler scheduler = null;

    /** Obtains the scheduler from a {@link StdSchedulerFactory}; a failure is only logged. */
    public QuartzManagerKit() {
        try {
            scheduler = new StdSchedulerFactory().getScheduler();
            log.info("初始化调度器 ");
        } catch (SchedulerException ex) {
            log.error("初始化调度器=> [失败]:" + ex.getLocalizedMessage(), ex);
        }
    }

    /**
     * Registers every enabled row of {@code job_manager} (columns {@code clazz}, {@code name},
     * {@code group}, {@code cron_expression}) and then starts the scheduler. Rows whose class
     * cannot be loaded as a {@link Job} are logged and skipped.
     */
    public void initJob() {
        List<Record> jobs = Db.find("SELECT * FROM job_manager WHERE 1=1 AND is_enabled = 'Y'");
        for (Record job : jobs) {
            if (!"Y".equals(job.getStr("is_enabled"))) {
                continue;
            }
            String className = job.getStr("clazz");
            Class<? extends Job> jobClazz;
            try {
                jobClazz = Class.forName(className).asSubclass(Job.class);
            } catch (Exception | LinkageError e) {
                // LinkageError covers classes that exist but fail to link or initialise;
                // one broken row must not stop the remaining jobs from being scheduled.
                System.out.println(className+"没有继承job,e=="+e);
                log.error(className+"没有继承job,e=="+e, e);
                continue;
            }
            String name = job.getStr("name");
            String group = job.getStr("group");
            String cronExpression = job.getStr("cron_expression");
            this.addJob(name, group, jobClazz, cronExpression);
        }
        this.start();
    }

    /**
     * Schedules a job with a cron trigger; job and trigger share the same name and group.
     *
     * @param name           job and trigger name
     * @param group          job and trigger group
     * @param clazz          job implementation
     * @param cronExpression Quartz cron expression
     */
    public void addJob(String name, String group, Class<? extends Job> clazz, String cronExpression) {
        try {
            JobDetail job = newJob(clazz).withIdentity(name, group).build();
            Trigger trg = newTrigger().withIdentity(name, group).withSchedule(cronSchedule(cronExpression)).build();
            scheduler.scheduleJob(job, trg);
            log.info("创建作业=> [作业名称:" + name + " 作业组:" + group + "] ");
            System.out.println("创建作业=> [作业名称:" + name + " 作业组:" + group + "] ");
        } catch (SchedulerException | RuntimeException e) {
            // The builders throw unchecked exceptions for an invalid cron expression or a
            // missing name; those come from table data and must not abort the caller.
            log.error("创建作业=> [作业名称:" + name + " 作业组:" + group + "]=> [失败]", e);
        }
    }

    /**
     * Removes a job: pauses and unschedules its trigger, then deletes the job itself.
     *
     * @param name  job and trigger name
     * @param group job and trigger group
     */
    public void removeJob(String name, String group) {
        try {
            TriggerKey tk = TriggerKey.triggerKey(name, group);
            scheduler.pauseTrigger(tk);
            scheduler.unscheduleJob(tk);
            scheduler.deleteJob(JobKey.jobKey(name, group));
            log.info("删除作业=> [作业名称:" + name + " 作业组:" + group + "] ");
            System.out.println("删除作业=> [作业名称:" + name + " 作业组:" + group + "] ");
        } catch (SchedulerException e) {
            log.error("删除作业=> [作业名称:" + name + " 作业组:" + group + "]=> [失败]", e);
        }
    }

    /**
     * Pauses a job.
     *
     * @param name  job name
     * @param group job group
     */
    public void pauseJob(String name, String group) {
        try {
            scheduler.pauseJob(JobKey.jobKey(name, group));
            log.info("暂停作业=> [作业名称:" + name + " 作业组:" + group + "] ");
        } catch (SchedulerException e) {
            log.error("暂停作业=> [作业名称:" + name + " 作业组:" + group + "]=> [失败]", e);
        }
    }

    /**
     * Resumes a paused job.
     *
     * @param name  job name
     * @param group job group
     */
    public void resumeJob(String name, String group) {
        try {
            scheduler.resumeJob(JobKey.jobKey(name, group));
            log.info("恢复作业=> [作业名称:" + name + " 作业组:" + group + "] ");
        } catch (SchedulerException e) {
            log.error("恢复作业=> [作业名称:" + name + " 作业组:" + group + "]=> [失败]", e);
        }
    }

    /**
     * Replaces the trigger of a job with one built from a new cron expression.
     *
     * @param name           trigger name
     * @param group          trigger group
     * @param cronExpression new Quartz cron expression
     */
    public void modifyTime(String name, String group, String cronExpression) {
        try {
            TriggerKey tk = TriggerKey.triggerKey(name, group);
            Trigger trg = newTrigger()
                    .withIdentity(name, group)
                    .withSchedule(cronSchedule(cronExpression))
                    .build();
            // rescheduleJob returns null when no trigger with this key exists, in which case
            // nothing was stored and reporting success would be wrong.
            if (scheduler.rescheduleJob(tk, trg) == null) {
                log.error("修改作业触发时间=> [作业名称:" + name + " 作业组:" + group + "]=> [失败]");
                return;
            }
            log.info("修改作业触发时间=> [作业名称:" + name + " 作业组:" + group + "] ");
        } catch (SchedulerException | RuntimeException e) {
            // RuntimeException: invalid cron expression or missing name.
            log.error("修改作业触发时间=> [作业名称:" + name + " 作业组:" + group + "]=> [失败]", e);
        }
    }

    /** Starts the scheduler so registered triggers begin to fire. */
    public void start() {
        try {
            scheduler.start();
            log.info("启动调度器 ");
            System.out.println("启动调度器 ");
        } catch (SchedulerException e) {
            log.error("启动调度器=> [失败]", e);
        }
    }

    /** Shuts the scheduler down. */
    public void shutdown() {
        try {
            scheduler.shutdown();
            log.info("停止调度器 ");
            System.out.println("停止调度器 ");
        } catch (SchedulerException e) {
            log.error("停止调度器=> [失败]", e);
        }
    }
}