jeecms 强大的采集功能优化

1:用下文的 AcquisitionSvcImpl.java 替换原工程项目 com.jeecms.cms.service 包下的同名文件。

2:编译工程即可

3:登录后台,配置相关采集规则,参数示例如下:

====================================
*采集名称: 韩寒博客

*页面编码: UTF-8

  动态地址: http://blog.sina.com.cn/s/articlelist_1191258123_0_[page].html

                        页码 从   1  到:  2

内容地址集:   <!-- 列表 START -->.*?<!-- 列表END -->

内容地址: target="_blank" href="(.*?)">(.*?)</a></span>

标题:         <title>(.*?)_韩寒_新浪博客</title>

内容:         <!-- 正文开始 -->(.*?)<!-- 正文结束 -->

========================================================= 

由于附件无法上传,下面直接贴出 AcquisitionSvcImpl.java 类的完整代码:
//---------------------------------------------------------------------------- 
package com.jeecms.cms.service; 

import java.io.IOException; 
import java.net.URI; 
import java.util.ArrayList; 
import java.util.List; 
import java.util.regex.Matcher; 
import java.util.regex.Pattern; 
import org.apache.commons.lang.StringUtils; 
import org.apache.http.HttpEntity; 
import org.apache.http.HttpResponse; 
import org.apache.http.StatusLine; 
import org.apache.http.client.ClientProtocolException; 
import org.apache.http.client.HttpClient; 
import org.apache.http.client.HttpResponseException; 
import org.apache.http.client.ResponseHandler; 
import org.apache.http.client.methods.HttpGet; 
import org.apache.http.impl.client.DefaultHttpClient; 
import org.apache.http.util.EntityUtils; 
import org.slf4j.Logger; 
import org.slf4j.LoggerFactory; 
import org.springframework.beans.factory.annotation.Autowired; 
import org.springframework.stereotype.Service; 
import com.jeecms.cms.entity.assist.CmsAcquisition; 
import com.jeecms.cms.entity.main.Content; 
import com.jeecms.cms.manager.assist.CmsAcquisitionMng; 

/**
 * Acquisition (web-scraping) service: starts a background thread that walks
 * the list pages of an acquisition plan, extracts article links with
 * user-configured regular expressions, downloads every article and stores the
 * extracted title/content through {@link CmsAcquisitionMng}.
 *
 * <p>All "start" settings of a {@link CmsAcquisition} (link-set, link, title,
 * content) are used as {@link Pattern} sources. The corresponding "end"
 * settings are passed around but not used by this implementation.
 *
 * <p>Patterns are compiled without flags, so {@code .} does not match line
 * terminators; a rule that has to span several lines must enable that itself
 * (for example with an embedded {@code (?s)}).
 */
@Service
public class AcquisitionSvcImpl implements AcquisitionSvc {
    private static final Logger log = LoggerFactory.getLogger(AcquisitionSvcImpl.class);

    private CmsAcquisitionMng cmsAcquisitionMng;

    /**
     * Starts the acquisition with the given id on a new background thread.
     *
     * @param id id of the acquisition to run
     * @return {@code false} when no acquisition with that id exists or its
     *         status already equals {@code CmsAcquisition.START};
     *         {@code true} once the worker thread has been started
     */
    public boolean start(Integer id) {
        CmsAcquisition acqu = cmsAcquisitionMng.findById(id);
        if (acqu == null || acqu.getStatus() == CmsAcquisition.START) {
            return false;
        }
        Thread thread = new AcquisitionThread(acqu);
        thread.start();
        return true;
    }

    @Autowired
    public void setCmsAcquisitionMng(CmsAcquisitionMng cmsAcquisitionMng) {
        this.cmsAcquisitionMng = cmsAcquisitionMng;
    }

    /**
     * Worker thread that performs one acquisition run. It is an inner
     * (non-static) class because it uses the enclosing service's manager.
     */
    private class AcquisitionThread extends Thread {
        private CmsAcquisition acqu;

        public AcquisitionThread(CmsAcquisition acqu) {
            super(acqu.getClass().getName() + "#" + acqu.getId());
            this.acqu = acqu;
        }

        /**
         * Marks the acquisition as started, crawls every plan URL and, when
         * the crawl ran to completion, marks the acquisition as ended. The
         * HTTP client is always shut down, even if the crawl fails with an
         * unexpected exception.
         */
        @Override
        public void run() {
            if (acqu == null) {
                return;
            }
            Integer requestedId = acqu.getId();
            acqu = cmsAcquisitionMng.start(requestedId);
            if (acqu == null) {
                // The acquisition may have disappeared between the lookup in
                // start(Integer) and this call; nothing to crawl then.
                log.warn("Acquisition#{} could not be started", requestedId);
                return;
            }
            Integer acquId = acqu.getId();
            HttpClient client = new DefaultHttpClient();
            boolean completed;
            try {
                completed = crawl(client, acquId);
            } finally {
                client.getConnectionManager().shutdown();
            }
            if (completed) {
                cmsAcquisitionMng.end(acquId);
                log.info("Acquisition#{} complete", acquId);
            } else {
                log.info("Acquisition#{} breaked", acquId);
            }
        }

        /**
         * Walks the plan URLs from the resume position down to index 0 and
         * saves every linked article.
         *
         * @param client HTTP client used for all requests of this run
         * @param acquId id of the running acquisition
         * @return {@code true} when all plans were processed, {@code false}
         *         when {@code isNeedBreak} asked the run to stop
         */
        private boolean crawl(HttpClient client, Integer acquId) {
            String[] plans = acqu.getAllPlans();
            if (plans == null) {
                plans = new String[0];
            }
            CharsetHandler handler = new CharsetHandler(acqu.getPageEncoding());
            int currNum = acqu.getCurrNum();
            int currItem = acqu.getCurrItem();

            // currNum/currItem are 1-based resume cursors counted from the end
            // of the arrays; clamp so a stored value <= 0 cannot index past
            // the last element.
            for (int i = Math.min(plans.length - currNum, plans.length - 1); i >= 0; i--) {
                String url = plans[i];
                List<String> contentList = getContentList(client, handler, url,
                        acqu.getLinksetStart(), acqu.getLinksetEnd(),
                        acqu.getLinkStart(), acqu.getLinkEnd());
                int total = contentList.size();
                for (int j = Math.min(total - currItem, total - 1); j >= 0; j--) {
                    if (cmsAcquisitionMng.isNeedBreak(acquId, plans.length - i, total - j, total)) {
                        return false;
                    }
                    pause();
                    String link = contentList.get(j);
                    saveContent(client, handler, acquId, link,
                            acqu.getTitleStart(), acqu.getTitleEnd(),
                            acqu.getContentStart(), acqu.getContentEnd());
                }
                // Only the first processed plan resumes mid-list; every later
                // plan starts with its last item.
                currItem = 1;
            }
            return true;
        }

        /**
         * Sleeps for the configured pause time between two article requests.
         * An interrupt does not abort the run (best effort, as before), but
         * the interrupt flag is restored and later pauses are skipped.
         */
        private void pause() {
            if (acqu.getPauseTime() > 0 && !Thread.currentThread().isInterrupted()) {
                try {
                    Thread.sleep(acqu.getPauseTime());
                } catch (InterruptedException e) {
                    log.warn("Acquisition#" + acqu.getId() + " pause interrupted", e);
                    Thread.currentThread().interrupt();
                }
            }
        }

        /**
         * Downloads a list page and returns the article links found on it.
         *
         * @param client       HTTP client to use
         * @param handler      response handler that decodes the body
         * @param url          address of the list page
         * @param linksetStart regex selecting the page region that holds the
         *                     links; when blank the whole page is searched
         * @param linksetEnd   unused
         * @param linkStart    regex whose first capturing group is the link
         * @param linkEnd      unused
         * @return the links in page order; empty (never {@code null}) when the
         *         page could not be fetched or parsed
         */
        private List<String> getContentList(HttpClient client,
                CharsetHandler handler, String url, String linksetStart,
                String linksetEnd, String linkStart, String linkEnd) {
            List<String> list = new ArrayList<String>();
            try {
                HttpGet httpget = new HttpGet(new URI(url));
                String html = client.execute(httpget, handler);
                if (html == null) {
                    log.warn("Empty response body for list page {}", url);
                    return list;
                }
                // An empty pattern would match "" and throw the page away, so
                // only narrow the page when a link-set rule is configured.
                if (!StringUtils.isBlank(linksetStart)) {
                    Matcher m = Pattern.compile(linksetStart.trim()).matcher(html);
                    if (m.find()) {
                        html = m.group();
                    }
                }
                list = getUrlsList(html, linkStart);
            } catch (Exception e) {
                // Best effort: a broken page or rule must not kill the run.
                log.warn("Failed to read link list from " + url, e);
            }
            return list;
        }

        /**
         * Extracts every link matched by the given pattern.
         *
         * @param html      text to search
         * @param linkStart regex; its first capturing group is the link, or
         *                  the whole match when it has no capturing group
         * @return non-empty links in order of appearance
         */
        private List<String> getUrlsList(String html, String linkStart) {
            List<String> list = new ArrayList<String>();
            Matcher m = Pattern.compile(linkStart).matcher(html);
            // group(1) throws when the pattern defines no group at all.
            int group = m.groupCount() >= 1 ? 1 : 0;
            while (m.find()) {
                String link = m.group(group);
                if (link != null && link.length() > 0) {
                    list.add(link);
                }
            }
            return list;
        }

        /**
         * Downloads one article page, extracts title and content and stores
         * them through the acquisition manager.
         *
         * @param client       HTTP client to use
         * @param handler      response handler that decodes the body
         * @param acquId       id of the running acquisition
         * @param url          address of the article page
         * @param titleStart   regex; its first capturing group (or the whole
         *                     match when it has none) becomes the title
         * @param titleEnd     unused
         * @param contentStart regex; the whole match (including any delimiters
         *                     written in the pattern) becomes the content
         * @param contentEnd   unused
         * @return the saved content, or {@code null} when fetching, parsing or
         *         saving failed
         */
        private Content saveContent(HttpClient client, CharsetHandler handler,
                Integer acquId, String url, String titleStart, String titleEnd,
                String contentStart, String contentEnd) {
            try {
                HttpGet httpget = new HttpGet(new URI(url));
                String html = client.execute(httpget, handler);
                if (html == null) {
                    log.warn("Empty response body for article page {}", url);
                    return null;
                }

                String title = "";
                Matcher mt = Pattern.compile(titleStart.trim()).matcher(html);
                if (mt.find()) {
                    String found = mt.groupCount() >= 1 ? mt.group(1) : mt.group();
                    if (found != null) {
                        title = found;
                    }
                }

                String txt = "";
                mt = Pattern.compile(contentStart.trim()).matcher(html);
                if (mt.find()) {
                    txt = mt.group();
                }

                return cmsAcquisitionMng.saveContent(title, txt, acquId);
            } catch (Exception e) {
                // Best effort: skip this article and continue with the next.
                log.warn("Failed to acquire content from " + url, e);
                return null;
            }
        }
    }

    /**
     * Response handler that returns the body as a string, decoded with the
     * configured charset when one is set.
     */
    private static class CharsetHandler implements ResponseHandler<String> {
        private final String charset;

        public CharsetHandler(String charset) {
            this.charset = charset;
        }

        /**
         * @return the decoded body, or {@code null} when the response has no
         *         entity
         * @throws HttpResponseException when the status code is 300 or above
         * @throws IOException           when reading the body fails
         */
        public String handleResponse(HttpResponse response)
                throws ClientProtocolException, IOException {
            StatusLine statusLine = response.getStatusLine();
            if (statusLine.getStatusCode() >= 300) {
                throw new HttpResponseException(statusLine.getStatusCode(),
                        statusLine.getReasonPhrase());
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return null;
            }
            if (StringUtils.isBlank(charset)) {
                return EntityUtils.toString(entity);
            }
            return EntityUtils.toString(entity, charset);
        }
    }
}
//-------------------------------------------------------------------------------- 


开始一个采集看看,你会发现世界是如此的美好,你可以让你的项目自己采集你想要的东东!

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值