jeecms 采集功能优化,基于htmlparser实现,多线程版

为了熟悉一下多线程相关知识,把jeecms采集器类改成了多线程版,还不是很完善,贴出来大家一起完善、改进。
说明:暂不支持暂停,停止功能。
用法:和我上一篇jeecms 采集功能优化,基于htmlparser实现里面的用法一样。
思路:想法很简单,在主线程处理类中,先取得当前采集任务下所有URL,并放入队列中,然后开启指定数目的线程(默认是2)采集内容。


代码清单:
采集器主类:MultiThreadAcquisitionSvcImpl.java
HTML解析工具类接口:ParseHtmlTool.java
HTML解析工具,HtmlParser实现类:HtmlParserImpl.java
采集参数封装bean:ParamBean.java
队列类:Queue.java
URL队列:UrlQueue.java

代码如下:


采集器主类:MultiThreadAcquisitionSvcImpl.java

package com.jeecms.cms.service;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.HttpResponseException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.jeecms.cms.entity.assist.CmsAcquisition;
import com.jeecms.cms.entity.main.Content;
import com.jeecms.cms.manager.assist.CmsAcquisitionMng;
/**
* 采集器-多线程版
* @author javacoo
* @since 2011-11-02
* @version 1.0
*/
@Service
public class MultiThreadAcquisitionSvcImpl implements AcquisitionSvc {
/** Logger shared by the service and its nested worker classes. */
private static final Logger log = LoggerFactory.getLogger(MultiThreadAcquisitionSvcImpl.class);
/** Number of worker threads started per acquisition run. */
private static final int THREAD_NUM = 2;
/** Pause, in milliseconds, a worker takes after each saved page. */
private static final int SLEEP_TIME = 100;
/** Map key under which a link's URL is stored. */
private static final String LINK_KEY = "linkKey";
/** Map key under which a link's title is stored. */
private static final String TITLE_KEY = "titleKey";
/** Acquisition manager, injected through {@link #setCmsAcquisitionMng}. */
private CmsAcquisitionMng cmsAcquisitionMng;
/** Per-thread holder for the HttpClient created by the coordinating thread. */
private static final ThreadLocal<HttpClient> httpClientThreadLocal = new ThreadLocal<HttpClient>();
/** Per-thread holder for the ParseHtmlTool created by the coordinating thread. */
private static final ThreadLocal<ParseHtmlTool> parseHtmlToolThreadLocal = new ThreadLocal<ParseHtmlTool>();
/** Per-thread holder for the UrlQueue created by the coordinating thread. */
private static final ThreadLocal<UrlQueue> urlQueueThreadLocal = new ThreadLocal<UrlQueue>();

/**
 * Injects the acquisition manager used to load, start, finish and persist
 * acquisition jobs.
 *
 * @param cmsAcquisitionMng the manager to use
 */
@Autowired
public void setCmsAcquisitionMng(CmsAcquisitionMng cmsAcquisitionMng) {
    this.cmsAcquisitionMng = cmsAcquisitionMng;
}

/**
 * Starts the acquisition job with the given id on a new coordinating thread.
 *
 * @param id id of the acquisition job
 * @return {@code false} when the job does not exist or its status is already
 *         {@code CmsAcquisition.START}; {@code true} once the thread is launched
 */
public boolean start(Integer id) {
    CmsAcquisition acqu = cmsAcquisitionMng.findById(id);
    if (acqu == null) {
        return false;
    }
    if (acqu.getStatus() == CmsAcquisition.START) {
        return false;
    }
    Thread coordinator = new Thread(new MainThreadProcesser(this, acqu));
    coordinator.start();
    return true;
}
/**
* 主线程处理类
* @author javacoo
* @since 2011-11-02
*/
private class MainThreadProcesser implements Runnable {
private CmsAcquisition acqu;
private AcquisitionSvc acquisitionSvc;
public MainThreadProcesser(AcquisitionSvc acquisitionSvc,CmsAcquisition acqu) {
this.acqu = acqu;
this.acquisitionSvc = acquisitionSvc;
}
public void run() {
long tStart = System.currentTimeMillis();
System.out.println("主线程:"+Thread.currentThread().getName() + "开始...");
try {
getHttpClient().getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,new HttpHost("128.160.64.5", 1235));
CharsetHandler handler = new CharsetHandler(acqu.getPageEncoding());
getAllUrls(acqu,handler);
CountDownLatch latch = new CountDownLatch(THREAD_NUM);
ExecutorService exec = Executors.newCachedThreadPool();
for(int i=0;i<THREAD_NUM;i++){
Thread thread = new Thread(new Processer(acquisitionSvc,acqu,latch,getHttpClient(),getUrlQueue(),getParseHtmlTool(acqu),handler));
exec.execute(thread);
}
latch.await();
exec.shutdown();
} catch (InterruptedException e) {
e.printStackTrace();
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (URISyntaxException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}finally{
httpClientThreadLocal.get().getConnectionManager().shutdown();
cmsAcquisitionMng.end(acqu.getId());
httpClientThreadLocal.remove();
parseHtmlToolThreadLocal.remove();
urlQueueThreadLocal.remove();

long tEnd = System.currentTimeMillis();
System.out.println("主线程:"+Thread.currentThread().getName() + "结束...");
System.out.println("主线程:"+Thread.currentThread().getName() + "总共用时:" + (tEnd - tStart) + "ms");
}
}
}
/**
 * Worker task: repeatedly takes one link/title map from the shared URL queue,
 * fetches and saves its content, then sleeps {@code SLEEP_TIME} milliseconds.
 * Counts the latch down when it finishes, whether normally or by failure.
 *
 * @author javacoo
 * @since 2011-11-02
 */
private class Processer implements Runnable {
    private AcquisitionSvc acquisitionSvc;
    private CmsAcquisition acqu;
    private CountDownLatch latch;
    private UrlQueue urlQueue;
    private HttpClient httpClient;
    private ParseHtmlTool parseHtmlTool;
    private CharsetHandler handler;

    public Processer(AcquisitionSvc acquisitionSvc, CmsAcquisition acqu, CountDownLatch latch, HttpClient httpClient, UrlQueue urlQueue, ParseHtmlTool parseHtmlTool, CharsetHandler handler) {
        this.acquisitionSvc = acquisitionSvc;
        this.acqu = acqu;
        this.latch = latch;
        this.urlQueue = urlQueue;
        this.httpClient = httpClient;
        this.parseHtmlTool = parseHtmlTool;
        this.handler = handler;
    }

    public void run() {
        System.out.println("======================子线程:" + Thread.currentThread().getName() + "开始...");
        try {
            while (true) {
                Map<String, String> urlMap;
                // The emptiness check and the dequeue must happen under one lock;
                // otherwise two workers can both see the last element and the
                // slower one fails when it tries to remove it. The queue helpers
                // lock the enclosing service instance, so the same monitor is used.
                synchronized (MultiThreadAcquisitionSvcImpl.this) {
                    if (urlAndTitleMapIsEmpty(urlQueue)) {
                        break;
                    }
                    urlMap = getUrlAndTitleMap(urlQueue);
                }
                saveContent(acqu, httpClient, parseHtmlTool, handler, urlMap);
                Thread.sleep(SLEEP_TIME);
            }
        } catch (InterruptedException e) {
            // Preserve the interrupt status for the executor that owns this thread.
            Thread.currentThread().interrupt();
            log.warn("Acquisition#" + acqu.getId() + " worker interrupted", e);
        } catch (Exception e) {
            log.warn("Acquisition#" + acqu.getId() + " worker failed", e);
        } finally {
            System.out.println("======================子线程:" + Thread.currentThread().getName() + "结束.");
            log.info("Acquisition#{} complete", acqu.getId());
            latch.countDown();
        }
    }
}


/**
 * Returns the HttpClient bound to the calling thread, creating and binding a
 * {@code DefaultHttpClient} on first use.
 *
 * @return the calling thread's HttpClient
 */
private static HttpClient getHttpClient() {
    HttpClient client = httpClientThreadLocal.get();
    if (client == null) {
        client = new DefaultHttpClient();
        httpClientThreadLocal.set(client);
    }
    return client;
}
/**
 * Returns the UrlQueue bound to the calling thread, creating and binding a
 * new one on first use.
 *
 * @return the calling thread's UrlQueue
 */
private static UrlQueue getUrlQueue() {
    UrlQueue queue = urlQueueThreadLocal.get();
    if (queue == null) {
        queue = new UrlQueue();
        urlQueueThreadLocal.set(queue);
    }
    return queue;
}
/**
 * Returns the ParseHtmlTool bound to the calling thread, creating an
 * {@code HtmlParserImpl} from the given job and binding it on first use.
 *
 * @param acqu acquisition job supplying the parser settings
 * @return the calling thread's ParseHtmlTool
 */
private static ParseHtmlTool getParseHtmlTool(CmsAcquisition acqu) {
    ParseHtmlTool tool = parseHtmlToolThreadLocal.get();
    if (tool == null) {
        tool = new HtmlParserImpl(acqu);
        parseHtmlToolThreadLocal.set(tool);
    }
    return tool;
}
/**
 * Adds one link/title map to the calling thread's URL queue.
 *
 * @param map link/title map to enqueue
 */
private synchronized void addUrlAndTitleMap(Map<String,String> map) {
    UrlQueue queue = getUrlQueue();
    queue.addUnVisitedUrl(map);
}
/**
 * Removes and returns the next link/title map from the given queue.
 *
 * @param urlQueue queue to take from
 * @return the dequeued link/title map
 */
private synchronized Map<String,String> getUrlAndTitleMap(UrlQueue urlQueue) {
    Map<String,String> next = urlQueue.unVisitedUrlDeQueue();
    return next;
}
/**
 * Reports whether the given queue currently holds no link/title maps.
 *
 * @param urlQueue queue to inspect
 * @return {@code true} if the queue is empty, {@code false} otherwise
 */
private synchronized boolean urlAndTitleMapIsEmpty(UrlQueue urlQueue) {
    boolean empty = urlQueue.isEmpty();
    return empty;
}
/**
 * Marks the job as started, downloads each of its plan (listing) pages and
 * enqueues every link/title pair found on them.
 *
 * Plans are walked backwards starting at {@code plans.length - currNum}.
 *
 * @param acqu acquisition job
 * @param handler response handler that decodes the page body
 * @throws URISyntaxException if a plan URL is not a valid URI
 * @throws ClientProtocolException on an HTTP protocol error
 * @throws IOException on a connection problem or a status code of 300 or above
 */
private void getAllUrls(CmsAcquisition acqu, CharsetHandler handler) throws URISyntaxException, ClientProtocolException, IOException {
    acqu = cmsAcquisitionMng.start(acqu.getId());
    String[] plans = acqu.getAllPlans();
    // Clamp the start index: a currNum of 0 would otherwise index one past the
    // end of the array. NOTE(review): presumably start() guarantees currNum >= 1
    // -- confirm; the clamp is harmless either way.
    int first = Math.min(plans.length - acqu.getCurrNum(), plans.length - 1);
    for (int i = first; i >= 0; i--) {
        String url = plans[i];
        if (StringUtils.isBlank(url)) {
            // A blank plan entry cannot be fetched; skip it instead of aborting the run.
            continue;
        }
        HttpGet httpGet = new HttpGet(new URI(url.trim()));
        String html = getHttpClient().execute(httpGet, handler);
        if (html == null) {
            // The handler returns null for a response without an entity; there is
            // nothing to parse in that case.
            continue;
        }
        List<Map<String,String>> urlAndTitleListMap = getParseHtmlTool(acqu).getUrlAndTitleMap(html);
        for (Map<String,String> map : urlAndTitleListMap) {
            addUrlAndTitleMap(map);
        }
    }
    System.out.println("=======当前线程:" + Thread.currentThread().getName() + "URL连接数:" + getUrlQueue().getUnVisitedUrl().getSize());
}
/**
 * Downloads the page behind one link/title map, extracts its content area
 * and stores it through the acquisition manager.
 *
 * The method is synchronized on the service instance, so page downloads made
 * through it run one at a time even with several workers.
 *
 * @param acqu acquisition job
 * @param httpClient client used for the download
 * @param parseHtmlTool parser used to extract the content area
 * @param handler response handler that decodes the page body
 * @param map link/title map; the link is read from {@code LINK_KEY}, the title from {@code TITLE_KEY}
 * @return the saved Content, or {@code null} if the link is missing or any step fails
 */
private synchronized Content saveContent(CmsAcquisition acqu, HttpClient httpClient, ParseHtmlTool parseHtmlTool, CharsetHandler handler, Map<String,String> map) {
    try {
        String link = map.get(LINK_KEY);
        if (StringUtils.isBlank(link)) {
            log.warn("Acquisition#{}: skipping entry without a link", acqu.getId());
            return null;
        }
        link = link.trim();
        // Decide by scheme prefix (case-insensitive) rather than by substring:
        // https links are absolute too, and a relative link may legitimately
        // carry "http://" inside its query string.
        boolean absolute = link.regionMatches(true, 0, "http://", 0, 7)
                || link.regionMatches(true, 0, "https://", 0, 8);
        // NOTE(review): the base for relative links is hard-coded; it looks like a
        // development leftover and should probably be derived from the plan URL.
        String target = absolute ? link : "http://localhost/v7/" + link;
        HttpGet httpGet = new HttpGet(new URI(target));
        String html = httpClient.execute(httpGet, handler);
        System.out.println("=============================子线程:" + Thread.currentThread().getName() + "执行");
        String txt = parseHtmlTool.getHtml(html);
        return cmsAcquisitionMng.saveContent(map.get(TITLE_KEY), txt, acqu.getId());
    } catch (Exception e) {
        // Best effort: one failing page must not stop the worker.
        log.warn("Acquisition#" + acqu.getId() + ": failed to save content", e);
        return null;
    }
}
/**
 * Response handler that turns an HTTP response into a String, decoding the
 * body with the configured charset when one is given.
 */
private class CharsetHandler implements ResponseHandler<String> {
    private String charset;

    public CharsetHandler(String charset) {
        this.charset = charset;
    }

    /**
     * @param response the HTTP response to convert
     * @return the decoded body, or {@code null} when the response has no entity
     * @throws HttpResponseException if the status code is 300 or above
     * @throws IOException if the body cannot be read
     */
    public String handleResponse(HttpResponse response)
            throws ClientProtocolException, IOException {
        StatusLine status = response.getStatusLine();
        int code = status.getStatusCode();
        if (code >= 300) {
            throw new HttpResponseException(code, status.getReasonPhrase());
        }
        HttpEntity body = response.getEntity();
        if (body == null) {
            return null;
        }
        if (StringUtils.isBlank(charset)) {
            return EntityUtils.toString(body);
        }
        return EntityUtils.toString(body, charset);
    }
}


}

相关辅助类

HTML解析工具类接口:ParseHtmlTool.java

package com.jeecms.cms.service;

import java.util.List;
import java.util.Map;

/**
* Contract for the HTML parsing helper used by the acquisition service.
* @author javacoo
* @since 2011-10-31
*/
public interface ParseHtmlTool {
/**
* Returns the links found in the given HTML.
* @param orginHtml raw HTML
* @return list of links
*/
List<String> getUrlList( String orginHtml);
/**
* Returns the link titles found in the given HTML.
* @param orginHtml raw HTML
* @return list of titles
*/
List<String> getTitleList(String orginHtml);
/**
* Returns the HTML of the configured content area.
* @param orginHtml raw HTML
* @return HTML of the selected area
*/
String getHtml(String orginHtml);
/**
* Returns the link/title pairs found in the given HTML, one map per link.
* @param orginHtml raw HTML
* @return list of link/title maps
*/
List<Map<String,String>> getUrlAndTitleMap(String orginHtml);
}

HTML解析工具,HtmlParser实现类:HtmlParserImpl.java

package com.jeecms.cms.service;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import com.jeecms.cms.entity.assist.CmsAcquisition;
/**
* HTML解析工具,HtmlParser实现类
* @author javacoo
* @since 2011-10-31
*/
public class HtmlParserImpl implements ParseHtmlTool{
/** Map key under which a link's URL is stored. */
private static final String LINK_KEY = "linkKey";
/** Map key under which a link's title is stored. */
private static final String TITLE_KEY = "titleKey";
/** Map key that groups plain tag names (as opposed to attribute=value filters). */
private static final String SINGLE_TAG = "singleTag";
/**
 * Regular expression for anchors: group 1 is the href value, group 2 the
 * anchor body. Attribute runs are matched with {@code [^>]*} instead of a
 * greedy {@code .*}, so each anchor is matched on its own; with the greedy
 * form several anchors on one line collapsed into a single match that kept
 * only the last href.
 */
private static final String LINK_REGX = "<a\\s+[^>]*?href=\"(.*?)\"[^>]*>(.*?)</a>";
/** Compiled form of {@link #LINK_REGX}; compiled once and shared. */
private static final Pattern pt = Pattern.compile(LINK_REGX);
/** Parsed acquisition parameters. */
private ParamBean paramBean;

/**
 * Creates a parser configured from the given acquisition job.
 *
 * @param acqu acquisition job whose link-set and content settings are parsed
 */
public HtmlParserImpl(CmsAcquisition acqu) {
    this.parseRequestParam(acqu);
}

/**
 * Returns the titles of the links inside the configured link-set area.
 *
 * @param orginHtml raw HTML
 * @return list of titles, or {@code null} when the link-set area is empty
 */
public List<String> getTitleList(String orginHtml) {
    String linkAreaHtml = getHtmlByFilter(paramBean.getLinksetStartMap(), paramBean.getLinksetEndMap(), orginHtml);
    if (StringUtils.isEmpty(linkAreaHtml)) {
        return null;
    }
    return getUrlOrTitleListByType(linkAreaHtml, TITLE_KEY);
}

/**
 * Returns the URLs of the links inside the configured link-set area.
 *
 * @param orginHtml raw HTML
 * @return list of URLs, or {@code null} when the link-set area is empty
 */
public List<String> getUrlList(String orginHtml) {
    String linkAreaHtml = getHtmlByFilter(paramBean.getLinksetStartMap(), paramBean.getLinksetEndMap(), orginHtml);
    if (StringUtils.isEmpty(linkAreaHtml)) {
        return null;
    }
    return getUrlOrTitleListByType(linkAreaHtml, LINK_KEY);
}
/**
 * Returns the HTML of the configured content area, with the configured
 * exclusions applied.
 *
 * @param orginHtml raw HTML
 * @return HTML of the content area
 */
public String getHtml(String orginHtml) {
    return getHtmlByFilter(paramBean.getContentStartMap(), paramBean.getContentEndMap(), orginHtml);
}
/**
 * Returns the link/title pairs of the configured link-set area.
 *
 * @param orginHtml raw HTML
 * @return list of maps, each holding one link and its title
 */
public List<Map<String,String>> getUrlAndTitleMap(String orginHtml) {
    List<Map<String,String>> pairs = getUrlAandTitleMap(orginHtml);
    return pairs;
}
/**
 * Parses the four filter settings of the acquisition job into a fresh
 * ParamBean. Settings that are null or empty leave the bean's default
 * (empty) map in place.
 *
 * @param acqu acquisition job holding the raw settings
 */
private void parseRequestParam(CmsAcquisition acqu) {
    paramBean = new ParamBean();

    String linksetStart = acqu.getLinksetStart();
    if (StringUtils.isNotEmpty(linksetStart)) {
        paramBean.setLinksetStartMap(populateParamMap(linksetStart));
    }

    String linksetEnd = acqu.getLinksetEnd();
    if (StringUtils.isNotEmpty(linksetEnd)) {
        paramBean.setLinksetEndMap(populateParamMap(linksetEnd));
    }

    String contentStart = acqu.getContentStart();
    if (StringUtils.isNotEmpty(contentStart)) {
        paramBean.setContentStartMap(populateParamMap(contentStart));
    }

    String contentEnd = acqu.getContentEnd();
    if (StringUtils.isNotEmpty(contentEnd)) {
        paramBean.setContentEndMap(populateParamMap(contentEnd));
    }
}
/**
 * Extracts the link-set area from the page and collects every anchor whose
 * href and body are both non-empty.
 *
 * @param html page HTML
 * @return list of maps, each with the link under LINK_KEY and the title under TITLE_KEY
 */
private List<Map<String,String>> getUrlAandTitleMap(String html) {
    String linkAreaHtml = getHtmlByFilter(paramBean.getLinksetStartMap(), paramBean.getLinksetEndMap(), html);
    List<Map<String,String>> pairs = new ArrayList<Map<String,String>>();
    for (Matcher matcher = pt.matcher(linkAreaHtml); matcher.find();) {
        String link = matcher.group(1);
        String title = matcher.group(2);
        if (StringUtils.isEmpty(link) || StringUtils.isEmpty(title)) {
            continue;
        }
        Map<String,String> pair = new HashMap<String, String>();
        pair.put(LINK_KEY, link);
        pair.put(TITLE_KEY, title);
        pairs.add(pair);
    }
    return pairs;
}
/**
 * Collects either the href values or the anchor bodies of all anchors in the
 * given HTML.
 *
 * @param html HTML to scan
 * @param type TITLE_KEY to collect titles; any other value collects links
 * @return list of links or titles, in document order
 */
private List<String> getUrlOrTitleListByType(String html, String type) {
    // Group 1 of the link pattern is the href, group 2 the anchor body.
    int group = TITLE_KEY.equals(type) ? 2 : 1;
    List<String> values = new ArrayList<String>();
    for (Matcher matcher = pt.matcher(html); matcher.find();) {
        values.add(matcher.group(group));
    }
    return values;
}
/**
 * Extracts the parts of a page selected by {@code tagMap}, then strips the
 * parts selected by {@code removeTagMap} and all HTML comments.
 *
 * Each map entry is either an attribute name mapped to one or more values,
 * or SINGLE_TAG mapped to one or more tag names; multiple values are
 * separated by {@code |}.
 *
 * @param tagMap filters selecting the nodes to keep; may be null (nothing selected)
 * @param removeTagMap filters selecting the nodes to strip; may be null
 * @param orginHtml raw HTML; null is treated as an empty page
 * @return the resulting HTML, or an empty string if parsing fails
 */
private String getHtmlByFilter(Map<String, String> tagMap,
        Map<String, String> removeTagMap, String orginHtml) {
    if (orginHtml == null) {
        return "";
    }
    try {
        Parser parser = new Parser();
        // Step 1: collect the HTML of every node matching a keep-filter.
        StringBuilder sb = new StringBuilder();
        if (tagMap != null) {
            for (Map.Entry<String, String> entry : tagMap.entrySet()) {
                // split() on a value without '|' yields that single value.
                for (String value : entry.getValue().split("\\|")) {
                    // Re-prime the parser for every filter. NOTE(review): per the
                    // htmlparser documentation a Parser must be reset before it
                    // is parsed again, so without this only the first filter
                    // would see the page.
                    parser.setInputHTML(orginHtml);
                    appendHtmlByFilter(parser, populateFilter(entry.getKey(), value), sb);
                }
            }
        }
        // Step 2: strip every node matching a remove-filter.
        String contentHtml = sb.toString();
        if (removeTagMap != null) {
            for (Map.Entry<String, String> entry : removeTagMap.entrySet()) {
                for (String value : entry.getValue().split("\\|")) {
                    contentHtml = removeHtmlByFilter(parser, populateFilter(entry.getKey(), value), contentHtml);
                }
            }
        }
        // Step 3: strip HTML comments.
        contentHtml = removeHtmlByFilter(parser, new NodeClassFilter(RemarkNode.class), contentHtml);
        return contentHtml;
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return "";
}

/**
 * Parses a comma-separated filter specification into a map.
 *
 * Supported item forms, which may be mixed:
 * <ul>
 * <li>attribute=value, e.g. {@code class=articleList|tips,id=fxwb|fxMSN}</li>
 * <li>plain tag name, e.g. {@code div,p,span}</li>
 * </ul>
 * Attribute items are stored under the attribute name; all plain tag names
 * are joined with {@code |} and stored under SINGLE_TAG. Only the first
 * {@code =} of an item separates name from value, so values may themselves
 * contain {@code =}. Items with an empty name or an empty value are ignored.
 *
 * @param paramStr filter specification
 * @return map from attribute name (or SINGLE_TAG) to the {@code |}-separated values
 */
private Map<String, String> populateParamMap(String paramStr) {
    Map<String, String> paramMap = new HashMap<String, String>();
    StringBuilder tagNames = new StringBuilder();
    for (String item : paramStr.split(",")) {
        int eq = item.indexOf('=');
        if (eq >= 0) {
            String key = item.substring(0, eq);
            String value = item.substring(eq + 1);
            // "class=" used to fail with ArrayIndexOutOfBoundsException; skip it.
            if (StringUtils.isNotEmpty(key) && StringUtils.isNotEmpty(value)) {
                paramMap.put(key, value);
            }
        } else if (StringUtils.isNotEmpty(item)) {
            tagNames.append(item).append("|");
        }
    }
    if (tagNames.length() > 0) {
        // Drop the trailing separator.
        paramMap.put(SINGLE_TAG, tagNames.substring(0, tagNames.length() - 1));
    }
    return paramMap;
}
/**
 * Builds the node filter for one key/value pair.
 *
 * @param key SINGLE_TAG for a tag-name filter, otherwise an attribute name
 * @param value tag name or attribute value
 * @return a TagNameFilter for SINGLE_TAG, otherwise a HasAttributeFilter
 */
private NodeFilter populateFilter(String key, String value) {
    if (SINGLE_TAG.equals(key)) {
        return new TagNameFilter(value);
    }
    return new HasAttributeFilter(key, value);
}
/**
 * Removes from the given HTML the text of every node that matches the filter.
 *
 * @param parser parser to reuse; it is loaded with {@code orginHtml}
 * @param filter filter selecting the nodes to remove
 * @param orginHtml HTML to clean
 * @return the HTML with the matching nodes' text removed
 * @throws ParserException if parsing fails
 */
private String removeHtmlByFilter(Parser parser, NodeFilter filter, String orginHtml) throws ParserException {
    parser.setInputHTML(orginHtml);
    NodeList matches = parser.extractAllNodesThatMatch(filter);
    String cleaned = orginHtml;
    int count = matches.size();
    for (int idx = 0; idx < count; idx++) {
        Node match = (Node) matches.elementAt(idx);
        cleaned = StringUtils.remove(cleaned, match.toHtml());
    }
    return cleaned;
}
/**
 * Appends the HTML of every node that matches the filter to the builder.
 *
 * @param parser parser already loaded with the page
 * @param filter filter selecting the nodes to collect
 * @param sb builder receiving the matching nodes' HTML
 * @throws ParserException if parsing fails
 */
private void appendHtmlByFilter(Parser parser, NodeFilter filter,
        StringBuilder sb) throws ParserException {
    NodeList matches = parser.extractAllNodesThatMatch(filter);
    int count = matches.size();
    for (int idx = 0; idx < count; idx++) {
        Node match = (Node) matches.elementAt(idx);
        sb.append(match.toHtml());
    }
}

/**
 * Parses a comma-separated filter specification into the given map.
 *
 * Supported item forms, which may be mixed:
 * <ul>
 * <li>attribute=value, e.g. {@code class=articleList|tips,id=fxwb|fxMSN}</li>
 * <li>plain tag name, e.g. {@code div,p,span}</li>
 * </ul>
 * Attribute items are stored under the attribute name; all plain tag names
 * are joined with {@code |} and stored under SINGLE_TAG. Only the first
 * {@code =} of an item separates name from value, so values may themselves
 * contain {@code =}. Items with an empty name or an empty value are ignored.
 *
 * @param paramMap map receiving the parsed entries
 * @param paramStr filter specification
 */
private void populateParamMap(Map<String, String> paramMap, String paramStr) {
    StringBuilder tagNames = new StringBuilder();
    for (String item : paramStr.split(",")) {
        int eq = item.indexOf('=');
        if (eq >= 0) {
            String key = item.substring(0, eq);
            String value = item.substring(eq + 1);
            // "class=" used to fail with ArrayIndexOutOfBoundsException; skip it.
            if (StringUtils.isNotEmpty(key) && StringUtils.isNotEmpty(value)) {
                paramMap.put(key, value);
            }
        } else if (StringUtils.isNotEmpty(item)) {
            tagNames.append(item).append("|");
        }
    }
    if (tagNames.length() > 0) {
        // Drop the trailing separator.
        paramMap.put(SINGLE_TAG, tagNames.substring(0, tagNames.length() - 1));
    }
}

/**
 * Test helper: reads a whole text file into a String. Every line read is
 * followed by a {@code \n} in the result, including the last one.
 *
 * @param szFileName absolute path of the file
 * @param charset name of the charset used to decode the file
 * @return the file content, or an empty string if the file cannot be opened,
 *         decoded or read
 */
public static String openFile(String szFileName, String charset) {
    FileInputStream in = null;
    try {
        in = new FileInputStream(new File(szFileName));
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
        StringBuilder szContent = new StringBuilder();
        String szTemp;
        while ((szTemp = reader.readLine()) != null) {
            szContent.append(szTemp).append("\n");
        }
        return szContent.toString();
    } catch (Exception e) {
        // Deliberate best effort for this test helper: any failure yields "".
        return "";
    } finally {
        // Close the underlying stream on every path; previously it leaked when
        // the charset was unsupported or a read failed.
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }
}
/**
 * Manual test: extracts the {@code class=m_list} area from a local file and
 * prints every link and title found in it.
 *
 * @throws ParserException declared for callers; parsing errors are handled inside getHtmlByFilter
 */
public void testFetchLinkAndTitle() throws ParserException {
    Map<String, String> keepFilters = new HashMap<String, String>();
    keepFilters.put("class", "m_list");
    Map<String, String> dropFilters = new HashMap<String, String>();
    //dropFilters.put("class", "atc_ic_f");

    String pageHtml = openFile("F:\\4.htm","UTF-8");
    String result = getHtmlByFilter(keepFilters, dropFilters, pageHtml);
    System.out.println("=============================result============================");
    System.out.println(result);
    System.out.println("==========================================================");

    Pattern linkPattern = Pattern.compile("<a.*href=\"(.*?)\".*>(.*?)</a>");
    for (Matcher matcher = linkPattern.matcher(result); matcher.find();) {
        String link = matcher.group(1);
        String title = matcher.group(2);
        if (StringUtils.isNotEmpty(link)) {
            System.out.println("url : " + link);
            System.out.println("title : " + title);
        }
    }
}
/**
 * Manual test: runs content extraction on a local file, keeping the
 * {@code id=artibody} area and stripping a set of tags and attributes.
 * The result is discarded.
 *
 * @throws ParserException declared for callers; parsing errors are handled inside getHtmlByFilter
 */
public void testFetchContent() throws ParserException {
    Map<String, String> keepFilters = new HashMap<String, String>();
    keepFilters.put("id", "artibody");

    Map<String, String> dropFilters = new HashMap<String, String>();
    dropFilters.put(SINGLE_TAG, "style|script");
    dropFilters.put("type", "text/javascript");
    dropFilters.put("class", "icon_fx|blkComment otherContent_01");
    dropFilters.put("style", "text-align: right;padding-right:10px;|margin-top:6px;|font-size: 12px ! important;|font-size:12px");
    dropFilters.put("id", "fxwb|fxMSN|fxMSN|comment_t_show_top");

    String pageHtml = openFile("F:\\6.shtml","GB2312");
    getHtmlByFilter(keepFilters, dropFilters, pageHtml);
}
/**
 * Manual test: parses a sample filter specification and prints every
 * key/value combination it produces.
 */
public void testParseParam() {
    Map<String, String> parsed = new HashMap<String, String>();
    populateParamMap(parsed, "class=articleList|tips,p,div");
    for (Map.Entry<String, String> entry : parsed.entrySet()) {
        String key = entry.getKey();
        String joined = entry.getValue();
        String[] values = joined.contains("|") ? joined.split("\\|") : new String[]{joined};
        for (String value : values) {
            System.out.println("tempKey:" + key);
            System.out.println("value:" + value);
        }
    }
}
/**
 * Manual test: prints a local file before and after its HTML comments are
 * stripped.
 *
 * @throws ParserException if parsing fails
 */
public void testRemarkFilter() throws ParserException {
    String before = openFile("F:\\6.shtml","GB2312");
    System.out.println("=========================过滤注释前HTML==================================");
    System.out.println(before);

    NodeFilter remarkFilter = new NodeClassFilter(RemarkNode.class);
    String after = removeHtmlByFilter(new Parser(), remarkFilter, before);
    System.out.println("=========================过滤注释后HTML==================================");
    System.out.println(after);
}
/**
 * Entry point for the manual tests; enable the call to run by uncommenting it.
 */
public static void main(String[] args) throws ParserException,
        URISyntaxException, IOException {
    CmsAcquisition emptyJob = new CmsAcquisition();
    HtmlParserImpl parseHtmlTool = new HtmlParserImpl(emptyJob);
    //parseHtmlTool.testParseParam();
    //parseHtmlTool.testFetchLinkAndTitle();
    //parseHtmlTool.testFetchContent();
    //parseHtmlTool.testRemarkFilter();
}

}
采集参数封装bean:ParamBean.java
package com.jeecms.cms.service;

import java.util.HashMap;
import java.util.Map;
/**
* 采集参数封装bean
* @author javacoo
* @since 2011-10-31
*/
public class ParamBean {
/**待采集连接区域属性MAP*/
private Map<String, String> linksetStartMap = new HashMap<String, String>();
/**待采集连接区域过滤属性MAP*/
private Map<String, String> linksetEndMap = new HashMap<String, String>();
/**待采集内容区域属性MAP*/
private Map<String, String> contentStartMap = new HashMap<String, String>();
/**待采集内容区域过滤属性MAP*/
private Map<String, String> contentEndMap = new HashMap<String, String>();

public Map<String, String> getLinksetStartMap() {
return linksetStartMap;
}
public void setLinksetStartMap(Map<String, String> linksetStartMap) {
this.linksetStartMap = linksetStartMap;
}
public Map<String, String> getLinksetEndMap() {
return linksetEndMap;
}
public void setLinksetEndMap(Map<String, String> linksetEndMap) {
this.linksetEndMap = linksetEndMap;
}
public Map<String, String> getContentStartMap() {
return contentStartMap;
}
public void setContentStartMap(Map<String, String> contentStartMap) {
this.contentStartMap = contentStartMap;
}
public Map<String, String> getContentEndMap() {
return contentEndMap;
}
public void setContentEndMap(Map<String, String> contentEndMap) {
this.contentEndMap = contentEndMap;
}


}
队列类:Queue.java
package com.jeecms.cms.service;

import java.util.LinkedList;
/**
 * Simple FIFO queue backed by a LinkedList. The class does no locking of
 * its own.
 *
 * @author javacoo
 * @since 2011-11-01
 * @param <T> element type
 */
public class Queue<T> {
    private LinkedList<T> queue = new LinkedList<T>();

    /**
     * Appends an element at the tail.
     *
     * @param t element to add
     */
    public void enQueue(T t) {
        queue.add(t);
    }

    /**
     * Removes and returns the element at the head.
     *
     * @return the head element
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public T deQueue() {
        return queue.remove();
    }

    /**
     * @return {@code true} if the queue holds no elements
     */
    public boolean isEmpty() {
        return queue.size() == 0;
    }

    /**
     * @param t element to look for
     * @return {@code true} if the queue contains an element equal to {@code t}
     */
    public boolean contains(T t) {
        return queue.indexOf(t) >= 0;
    }

    /**
     * @return number of elements in the queue
     */
    public int getSize() {
        return queue.size();
    }
}
URL队列:UrlQueue.java

package com.jeecms.cms.service;

import java.util.Map;

import org.springframework.util.CollectionUtils;

/**
 * Queue of link/title maps that still have to be fetched.
 *
 * @author javacoo
 * @since 2011-11-01
 */
public class UrlQueue {
    /** Entries waiting to be visited. */
    private Queue<Map<String, String>> unVisitedUrl = new Queue<Map<String, String>>();

    /**
     * @return the underlying queue of pending entries
     */
    public Queue<Map<String, String>> getUnVisitedUrl() {
        return this.unVisitedUrl;
    }

    /**
     * Removes and returns the next pending entry.
     *
     * @return the dequeued link/title map
     */
    public Map<String, String> unVisitedUrlDeQueue() {
        Map<String, String> next = unVisitedUrl.deQueue();
        return next;
    }

    /**
     * Enqueues an entry unless it is null or empty, or an equal entry is
     * already waiting in the queue.
     *
     * @param urlMap link/title map to add
     */
    public void addUnVisitedUrl(Map<String, String> urlMap) {
        if (CollectionUtils.isEmpty(urlMap)) {
            return;
        }
        if (unVisitedUrl.contains(urlMap)) {
            return;
        }
        unVisitedUrl.enQueue(urlMap);
    }

    /**
     * @return {@code true} if no entries are pending
     */
    public boolean isEmpty() {
        boolean empty = unVisitedUrl.isEmpty();
        return empty;
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值