在已有的 SSH(Struts + Spring + Hibernate)框架中,增加一个 Spring 配置文件 applicationContext-compassConfig.xml。
文件内容:
<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd">
    <!-- compass2.2 config start -->
    <bean id="annotationConfiguration" class="org.compass.annotations.config.CompassAnnotationsConfiguration"></bean>
    <bean id="compass" class="org.compass.spring.LocalCompassBean">
        <property name="resourceDirectoryLocations">
            <list>
                <value>classpath:com/tjsoft/SearchEngines</value>
            </list>
        </property>
        <!-- Location where the index is stored -->
        <property name="connection">
            <value>/lucene/indexes</value>
        </property>
        <!-- Annotation-based mappings: the annotated classes Compass should index -->
        <property name="classMappings">
            <list>
                <!-- Compass test/model classes -->
                <value>com.tjsoft.SearchEngines.compass.model.Compassbean</value>
                <value>com.tjsoft.SearchEngines.compass.model.TextFile</value>
                <value>com.tjsoft.SearchEngines.compass.model.ExternalDBBean</value>
            </list>
        </property>
        <property name="compassConfiguration" ref="annotationConfiguration"/>
        <property name="compassSettings">
            <props>
                <!-- Alternative way to set the index location:
                     <prop key="compass.engine.connection"> file://${user.home}/lucene/indexes </prop> -->
                <prop key="compass.engine.mergeFactor">100</prop>
                <prop key="compass.engine.maxBufferedDocs">1000</prop>
                <prop key="compass.engine.maxFieldLength">100000</prop>
                <prop key="compass.transaction.factory">org.compass.spring.transaction.SpringSyncTransactionFactory</prop>
                <!-- Markup wrapped around highlighted search hits -->
                <prop key="compass.engine.highlighter.default.formatter.simple.pre"><![CDATA[<span style='background-color:yellow;color:red;'>]]></prop>
                <prop key="compass.engine.highlighter.default.formatter.simple.post"><![CDATA[</span>]]></prop>
                <!-- Length of the summary (fragment) text -->
                <prop key="compass.engine.highlighter.default.fragmenter.simple.size">200</prop>
                <!-- Analyzer (word segmenter) to use -->
                <prop key="compass.engine.analyzer.default.type">net.paoding.analysis.analyzer.PaodingAnalyzer</prop>
            </props>
        </property>
        <property name="transactionManager" ref="transactionManager"/>
    </bean>
    <bean id="hibernateGpsDevice" class="org.compass.spring.device.hibernate.dep.SpringHibernate3GpsDevice">
        <property name="name" value="hibernateDevice"/>
        <property name="sessionFactory" ref="sessionFactory"/>
        <property name="mirrorDataChanges" value="true"/>
    </bean>
    <!-- Keeps the index updated in sync with data changes -->
    <bean id="compassGps" class="org.compass.gps.impl.SingleCompassGps" init-method="start" destroy-method="stop">
        <property name="compass" ref="compass"/>
        <property name="gpsDevices">
            <list>
                <ref local="hibernateGpsDevice"/>
            </list>
        </property>
    </bean>
    <!-- Compass template -->
    <bean id="compassTemplate" class="org.compass.core.CompassTemplate">
        <property name="compass" ref="compass" />
    </bean>
    <!-- Rebuilds the index on a schedule (via Quartz) or when the Spring ApplicationContext starts.
         CompassIndexBuilderImp only starts its indexing thread when buildIndex is true,
         so with the value below no index is built at startup. -->
    <bean id="compassIndexBuilder" class="com.tjsoft.SearchEngines.compass.service.imp.CompassIndexBuilderImp" lazy-init="false">
        <property name="compassGps" ref="compassGps" />
        <property name="buildIndex" value="false" />
        <property name="compassTemplate" ref="compassTemplate" />
        <property name="compassService" ref="compassService" />
    </bean>
    <!-- compass2.2 config end -->
</beans>
最后一个 bean 中引用的 com.tjsoft.SearchEngines.compass.service.imp.CompassIndexBuilderImp 代码如下:
package com.tjsoft.SearchEngines.compass.service.imp;
import java.io.File;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import javax.naming.NameNotFoundException;
import javax.sql.DataSource;
import org.apache.log4j.Logger;
import org.compass.core.CompassTemplate;
import org.compass.gps.CompassGps;
import org.springframework.beans.factory.InitializingBean;
import com.tjsoft.SearchEngines.common.util.IDFactory;
import com.tjsoft.SearchEngines.common.util.PreparatorUtil;
import com.tjsoft.SearchEngines.compass.dbConnection.ConnectionFactory;
import com.tjsoft.SearchEngines.compass.dbConnection.ConnectionParam;
import com.tjsoft.SearchEngines.compass.model.Compassbean;
import com.tjsoft.SearchEngines.compass.model.ExternalDBBean;
import com.tjsoft.SearchEngines.compass.model.TCompassIndexinfo;
import com.tjsoft.SearchEngines.compass.model.TCompassIndexinfoSql;
import com.tjsoft.SearchEngines.compass.model.TextFile;
import com.tjsoft.SearchEngines.compass.service.CompassIndexBuilder;
import com.tjsoft.SearchEngines.compass.service.TCompassService;
/**
 * Builds the Compass search index after the application has started.
 *
 * <p>When {@code buildIndex} is {@code true}, {@link #afterPropertiesSet()} starts a daemon
 * thread that waits {@code lazyTime} seconds, re-indexes through {@code CompassGps} and then
 * indexes every additional target returned by {@code compassService.getIndexTargetList()}:
 * targets of type {@code "file"} are walked as files/directories, targets of type
 * {@code "db"} are indexed by running their configured SQL statements.
 *
 * @author wbin
 */
public class CompassIndexBuilderImp implements InitializingBean,CompassIndexBuilder {
    private static final Logger logger = Logger.getLogger(CompassIndexBuilderImp.class.getName());

    /** Whether the index is built at startup; {@code false} disables the builder. */
    private boolean buildIndex;
    /** Compass GPS used for the full re-index of the mapped entities. */
    private CompassGps compassGps;
    private CompassTemplate compassTemplate;
    private TCompassService compassService;
    /** Delay, in seconds, before the indexing work starts. */
    private int lazyTime = 10;

    /** Background thread started by {@link #afterPropertiesSet()}; it simply runs one index build. */
    private Thread indexThread = new Thread() {
        @Override
        public void run() {
            buildIndexes();
        }
    };

    /**
     * Called after dependency injection; starts the background index build when
     * {@code buildIndex} is enabled.
     */
    public void afterPropertiesSet() throws Exception {
        if (buildIndex) {
            indexThread.setDaemon(true);
            indexThread.setName("Compass Indexer");
            indexThread.start();
        }
    }

    /**
     * Runs a complete index build synchronously on the calling thread (including the
     * {@code lazyTime} delay), exactly as the previous direct {@code run()} call did.
     */
    public void restartUpdateIndex() {
        buildIndexes();
    }

    /**
     * Performs one complete index build: delay, Compass GPS re-index, then every
     * configured file and database target.
     */
    private void buildIndexes() {
        long beginTime = System.currentTimeMillis();
        try {
            Thread.sleep(lazyTime * 1000L);
        } catch (InterruptedException e) {
            // Restore the flag and stop: reporting "index created" here would be untrue.
            Thread.currentThread().interrupt();
            logger.warn("索引线程被中断,已放弃创建索引", e);
            return;
        }
        System.out.println("搜索引擎开始创建索引...");
        logger.info("搜索引擎开始创建索引...");
        // Full re-index. If index files for the mapped entities already exist, the work is
        // done in a temporary index that replaces the old one on completion
        // (per the original author's note).
        compassGps.index();

        List<TCompassIndexinfo> infoList = compassService.getIndexTargetList();
        if (infoList != null) {
            for (TCompassIndexinfo info : infoList) {
                // Constant-first equals: a target without a type is skipped instead of
                // aborting the whole run with a NullPointerException.
                String targetType = info.getTargetType();
                if ("file".equals(targetType)) {
                    index(info.getIndextargetpath());
                } else if ("db".equals(targetType)) {
                    indexDatabaseTarget(info);
                }
            }
        }

        long costTime = System.currentTimeMillis() - beginTime;
        System.out.println("创建索引完成。");
        System.out.println("耗时 " + costTime + " 毫秒");
        logger.info("创建索引完成。");
        logger.info("耗时 " + costTime + " 毫秒");
    }

    /**
     * Indexes one external database target: registers its data source, runs every configured
     * SQL statement, indexes the rows, and finally unbinds the data source again.
     * Failures are logged and do not stop the remaining targets.
     */
    private void indexDatabaseTarget(TCompassIndexinfo info) {
        // 1. Register a connection pool for this target.
        ConnectionParam param = new ConnectionParam(info.getDataSourceName(), info.getDriver(), info.getUrl(),
                info.getUserName(), info.getPassword(),
                1, 5, 20000, false, 100, "");
        ConnectionFactory cFactory = ConnectionFactory.getInstance();
        try {
            cFactory.bind(info.getDataSourceName(), param, false);
            // 2. Look the data source up by name and open a connection.
            DataSource ds = cFactory.lookUp(info.getDataSourceName());
            Connection conn = ds.getConnection();
            try {
                for (Iterator<TCompassIndexinfoSql> ite = info.getCompassIndexinfoSqls().iterator(); ite.hasNext();) {
                    TCompassIndexinfoSql compassIndexinfoSql = ite.next();
                    indexQuery(conn, compassIndexinfoSql.getSqlstr());
                }
            } finally {
                try {
                    conn.close();
                } catch (Exception e) {
                    logger.error(e.getMessage(), e);
                }
            }
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        } finally {
            try {
                cFactory.unbind(info.getDataSourceName());
            } catch (NameNotFoundException e) {
                logger.error(e.getMessage(), e);
            }
            logger.info("释放数据源");
        }
    }

    /**
     * Executes one SQL query and indexes its rows; the statement and result set are always
     * closed afterwards (the original code leaked both).
     */
    private void indexQuery(Connection conn, String sql) throws SQLException {
        PreparedStatement ps = null;
        ResultSet res = null;
        try {
            ps = conn.prepareStatement(sql);
            res = ps.executeQuery();
            indexDbRs(res);
        } finally {
            if (res != null) {
                try {
                    res.close();
                } catch (SQLException e) {
                    logger.error(e.getMessage(), e);
                }
            }
            if (ps != null) {
                try {
                    ps.close();
                } catch (SQLException e) {
                    logger.error(e.getMessage(), e);
                }
            }
        }
    }

    /**
     * Indexes a single file: detects its type, extracts its text, wraps it in a
     * {@code TextFile} and hands it to Compass. Any failure is logged and the file skipped.
     */
    private void indexFile(File file) {
        try {
            String fileType = PreparatorUtil.getfiletypeByFile(file);
            String content = extractContent(file, fileType);

            TextFile tf = new TextFile();
            tf.setFileId(IDFactory.getId());
            tf.setTitle(file.getName());
            // Stored with forward slashes regardless of platform.
            tf.setPath(file.getCanonicalPath().replaceAll("\\\\","/"));
            tf.setContent(content);
            tf.setLastModifyTime(file.lastModified());
            tf.setFileType(fileType);
            compassTemplate.create(tf);
        } catch (Exception e) {
            logger.error("读取文件:"+file.getAbsolutePath()+"出错!", e);
        }
    }

    /**
     * Returns the text to index for a file of the given detected type. Unknown or
     * unsupported types fall back to the file's absolute path.
     */
    private String extractContent(File file, String fileType) throws Exception {
        String path = file.getAbsolutePath();
        if (fileType == null) {
            return path;
        }
        if (fileType.equals("txt") || fileType.equals("sql") || fileType.equals("java")) {
            return PreparatorUtil.readTxt(path);
        }
        if (fileType.equals("pdf")) {
            return PreparatorUtil.readPdf(path);
        }
        // NOTE(review): "pst" (Outlook store) is routed to the Office extractor here; confirm intended.
        if (fileType.equals("doc") || fileType.equals("docx") || fileType.equals("pst")
                || fileType.equals("xls") || fileType.equals("xlsx")
                || fileType.equals("pptx") || fileType.equals("ppt")) {
            return PreparatorUtil.readOffic(path);
        }
        if (fileType.equals("html") || fileType.equals("htm")) {
            return PreparatorUtil.html2text(PreparatorUtil.readHtml(path));
        }
        return path;
    }

    /**
     * Indexes a file, or recursively every file below a directory.
     */
    private void index(File file) {
        if (file.isFile()) {
            indexFile(file);
            return;
        }
        File[] children = file.listFiles();
        if (children == null) {
            // Not a directory, or it could not be listed.
            return;
        }
        for (File child : children) {
            if (child.isDirectory()) {
                index(child);
            } else {
                indexFile(child);
            }
        }
    }

    /**
     * Indexes the file or directory tree at the given path.
     *
     * @param filePath path of a file or directory; a {@code null} path is logged and ignored
     */
    public void index(String filePath) {
        if (filePath == null) {
            logger.warn("index(String) called with a null path; nothing indexed");
            return;
        }
        index(new File(filePath));
    }

    /**
     * Removes a file from the index by deleting a {@code Compassbean} carrying its path.
     * NOTE(review): files are indexed above as {@code TextFile} with a generated id, so it is
     * unclear from this class whether this delete can ever match; verify against the mappings.
     */
    public void unIndex(File file) {
        Compassbean tf = new Compassbean();
        tf.setPath(file.getAbsolutePath());
        compassTemplate.delete(tf);
    }

    /**
     * Indexes every row of a result set as an {@code ExternalDBBean} whose parameter list
     * holds the row's column values as strings. SQL errors are logged and end the loop.
     *
     * @param res result set to index; {@code null} is ignored. The caller keeps ownership
     *            and is responsible for closing it.
     */
    public void indexDbRs(ResultSet res) {
        if (res == null) {
            return;
        }
        try {
            // The column count does not change between rows; read it once.
            int columnCount = res.getMetaData().getColumnCount();
            while (res.next()) {
                ExternalDBBean externalDBBean = new ExternalDBBean();
                externalDBBean.setBeanID(IDFactory.getId());
                List<String> parmList = new ArrayList<String>();
                for (int column = 1; column <= columnCount; column++) {
                    parmList.add(res.getString(column));
                }
                externalDBBean.setPramList(parmList);
                externalDBBean.setFileType("otherDB");
                compassTemplate.create(externalDBBean);
            }
        } catch (SQLException e) {
            logger.error(e.getMessage(), e);
        }
    }

    /**
     * @param buildIndex whether to build the index at startup
     */
    public void setBuildIndex(boolean buildIndex) {
        this.buildIndex = buildIndex;
    }

    /**
     * @param compassGps the Compass GPS used for the full re-index
     */
    public void setCompassGps(CompassGps compassGps) {
        this.compassGps = compassGps;
    }

    public CompassTemplate getCompassTemplate() {
        return compassTemplate;
    }

    public void setCompassTemplate(CompassTemplate compassTemplate) {
        this.compassTemplate = compassTemplate;
    }

    public TCompassService getCompassService() {
        return compassService;
    }

    public void setCompassService(TCompassService compassService) {
        this.compassService = compassService;
    }
}
该类实现的接口 CompassIndexBuilder 定义如下:
/**
 * Contract for the component that builds the search index.
 */
public interface CompassIndexBuilder {

    /**
     * Hook invoked once the implementation's properties have been set.
     *
     * @throws Exception if initialization fails
     */
    void afterPropertiesSet() throws Exception;

    /**
     * Indexes the file or directory found at the given path.
     *
     * @param filePath path of the file or directory to index
     */
    void index(String filePath);

    /**
     * Triggers a fresh run of the index build.
     */
    void restartUpdateIndex();
}
读取文件时用到的工具类 PreparatorUtil 代码如下:
package com.tjsoft.SearchEngines.common.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.xmlbeans.XmlException;
/**
 * Text-extraction helpers used when indexing files: file-type detection from the leading
 * bytes, plain-text/PDF/Office/HTML readers, and an HTML-to-text converter.
 *
 * @author wbin
 */
public class PreparatorUtil {
    /**
     * File type (extension) mapped to the upper-case hex prefix expected at the start of
     * such a file. Insertion-ordered so that detection is deterministic.
     */
    public final static Map<String, String> FILE_TYPE_MAP = new java.util.LinkedHashMap<String, String>();

    /** Number of leading bytes inspected for type detection. */
    private static final int HEADER_LENGTH = 50;

    private PreparatorUtil(){}

    static{
        getAllFileType(); // populate the signature table
    }

    /**
     * Fills {@link #FILE_TYPE_MAP} with signatures of common file types.
     */
    private static void getAllFileType()
    {
        FILE_TYPE_MAP.put("jpg", "FFD8FF"); //JPEG (jpg)
        FILE_TYPE_MAP.put("png", "89504E47"); //PNG (png)
        FILE_TYPE_MAP.put("gif", "47494638"); //GIF (gif)
        FILE_TYPE_MAP.put("tif", "49492A00"); //TIFF (tif)
        // Fixed: the previous value was the header of one particular PNG image; BMP files start with "BM".
        FILE_TYPE_MAP.put("bmp", "424D"); //Windows Bitmap (bmp)
        FILE_TYPE_MAP.put("dwg", "41433130"); //CAD (dwg)
        // Fixed: the previous value "68746D6C3E" ("html>") lacked the leading '<' and could not match "<html>".
        FILE_TYPE_MAP.put("html", "3C68746D6C3E"); //HTML (html), "<html>"
        FILE_TYPE_MAP.put("htm", "3C21444F435459504520"); //HTML (htm), "<!DOCTYPE "
        FILE_TYPE_MAP.put("rtf", "7B5C727466"); //Rich Text Format (rtf)
        FILE_TYPE_MAP.put("xml", "3C3F786D6C");
        FILE_TYPE_MAP.put("zip", "504B03040A00000000009");
        FILE_TYPE_MAP.put("rar", "52617221");
        FILE_TYPE_MAP.put("psd", "38425053"); //Photoshop (psd)
        FILE_TYPE_MAP.put("eml", "44656C69766572792D646174653A"); //Email [thorough only] (eml)
        FILE_TYPE_MAP.put("dbx", "CFAD12FEC5FD746F"); //Outlook Express (dbx)
        FILE_TYPE_MAP.put("pst", "2142444E"); //Outlook (pst)
        FILE_TYPE_MAP.put("xls", "D0CF11E0A1B11AE1000000000000000000000000000000003E000300FEFF0900060000000000000000000000010000000100"); //MS Excel
        FILE_TYPE_MAP.put("xlsx", "504B030414000600080000002100C8A3");
        FILE_TYPE_MAP.put("doc", "D0CF11E0A1B11AE1000000000000000000000000000000003E000300FEFF09000600000000000000000000004E0000005600"); //MS Word
        FILE_TYPE_MAP.put("docx", "504B030414000600080000002100729");
        FILE_TYPE_MAP.put("pptx", "504B03041400060008000000210036F7");
        FILE_TYPE_MAP.put("ppt", "D0CF11E0A1B11AE1000000000000000000000000000000003E000300FEFF0900060000000000000000000000020000000100");
        FILE_TYPE_MAP.put("mdb", "5374616E64617264204A"); //MS Access (mdb)
        FILE_TYPE_MAP.put("wpd", "FF575043"); //WordPerfect (wpd)
        FILE_TYPE_MAP.put("eps", "252150532D41646F6265");
        FILE_TYPE_MAP.put("ps", "252150532D41646F6265");
        FILE_TYPE_MAP.put("pdf", "255044462D312E"); //Adobe Acrobat (pdf)
        FILE_TYPE_MAP.put("qdf", "AC9EBD8F"); //Quicken (qdf)
        FILE_TYPE_MAP.put("pwl", "E3828596"); //Windows Password (pwl)
        FILE_TYPE_MAP.put("wav", "57415645"); //Wave (wav)
        FILE_TYPE_MAP.put("avi", "41564920");
        FILE_TYPE_MAP.put("ram", "2E7261FD"); //Real Audio (ram)
        FILE_TYPE_MAP.put("rm", "2E524D46"); //Real Media (rm)
        FILE_TYPE_MAP.put("mpg", "000001BA"); //
        FILE_TYPE_MAP.put("mov", "6D6F6F76"); //Quicktime (mov)
        FILE_TYPE_MAP.put("asf", "3026B2758E66CF11"); //Windows Media (asf)
        FILE_TYPE_MAP.put("mid", "4D546864"); //MIDI (mid)
        // NOTE(review): the three entries below match specific leading text
        // ("select \r", "select 20", "package com."), not a real file format.
        FILE_TYPE_MAP.put("sql", "73656C656374200D0");
        FILE_TYPE_MAP.put("txt", "73656C6563742032303");
        FILE_TYPE_MAP.put("java", "7061636B61676520636F6D2E");
    }

    /**
     * Extracts the text of an Office document through POI's {@code ExtractorFactory}.
     *
     * @param path path of the document
     * @return the extracted text, trimmed
     * @throws IllegalStateException if no extractor could be created for the file (the
     *         original code swallowed the cause and then failed with a NullPointerException)
     */
    public static String readOffic(String path) {
        File inputFile = new File(path);
        POITextExtractor extractor = null;
        try {
            extractor = ExtractorFactory.createExtractor(inputFile);
        } catch (InvalidFormatException e) {
            throw extractionFailure(path, e);
        } catch (IOException e) {
            throw extractionFailure(path, e);
        } catch (OpenXML4JException e) {
            throw extractionFailure(path, e);
        } catch (XmlException e) {
            throw extractionFailure(path, e);
        }
        if (extractor == null) {
            throw extractionFailure(path, null);
        }
        String text = extractor.getText();
        return text == null ? "" : text.trim();
    }

    /** Builds the exception reported when an Office document cannot be read. */
    private static IllegalStateException extractionFailure(String path, Exception cause) {
        return new IllegalStateException("Cannot extract text from office file: " + path, cause);
    }

    /**
     * Extracts the text of a PDF file.
     *
     * @param path path of the PDF
     * @return the extracted text, trimmed
     * @throws Exception if the file cannot be read or parsed; the original code called
     *         {@code System.exit(1)} here, which would shut down the whole JVM
     */
    public static String readPdf(String path) throws Exception {
        FileInputStream fis = null;
        PDDocument pdfDocument = null;
        try {
            fis = new FileInputStream(path);
            pdfDocument = PDDocument.load(fis);
            PDFTextStripper stripper = new PDFTextStripper();
            StringWriter writer = new StringWriter();
            stripper.writeText(pdfDocument, writer);
            return writer.toString().trim();
        } finally {
            if (pdfDocument != null) {
                try {
                    // Closing the PDDocument is relied on to release the underlying COSDocument too.
                    pdfDocument.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            closeQuietly(fis);
        }
    }

    /**
     * Reads an HTML file as UTF-8 text, keeping tags and styles. On an I/O error the stack
     * trace is printed and whatever was read so far is returned.
     *
     * @param urlString path of the HTML file
     * @return the file content with lines joined by {@code "\n"}
     */
    public static String readHtml(String urlString) {
        StringBuffer content = new StringBuffer("");
        FileInputStream fis = null;
        BufferedReader reader = null;
        try {
            fis = new FileInputStream(new File(urlString));
            // The charset must match the one declared by the page, otherwise text is garbled.
            reader = new BufferedReader(new InputStreamReader(fis,"utf-8"));
            String line = null;
            while ((line = reader.readLine()) != null) {
                content.append(line).append("\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // reader is null when the file could not be opened; closing it unguarded threw NPE.
            closeQuietly(reader);
            closeQuietly(fis);
        }
        return content.toString();
    }

    /**
     * Reads a text file using the platform default charset. On an I/O error the stack trace
     * is printed and whatever was read so far is returned.
     *
     * @param path path of the text file
     * @return the content with lines joined by {@code "\r"}, trimmed
     */
    public static String readTxt(String path) {
        StringBuffer content = new StringBuffer("");
        FileReader reader = null;
        BufferedReader br = null;
        try {
            reader = new FileReader(path);
            br = new BufferedReader(reader);
            String line = null;
            while ((line = br.readLine()) != null) {
                content.append(line).append("\r");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Both are null when the file could not be opened; closing them unguarded threw NPE.
            closeQuietly(br);
            closeQuietly(reader);
        }
        return content.toString().trim();
    }

    /** Closes a resource if it was opened, printing (not propagating) any close failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts bytes to a lower-case, two-digits-per-byte hex string.
     *
     * @param b bytes to convert
     * @return the hex string, or {@code null} if {@code b} is {@code null} or empty
     */
    public static String getFileHexString(byte[] b)
    {
        if (b == null || b.length <= 0)
        {
            return null;
        }
        StringBuilder stringBuilder = new StringBuilder(b.length * 2);
        for (int i = 0; i < b.length; i++)
        {
            String hv = Integer.toHexString(b[i] & 0xFF);
            if (hv.length() < 2)
            {
                stringBuilder.append(0);
            }
            stringBuilder.append(hv);
        }
        return stringBuilder.toString();
    }

    /**
     * Detects a file type from the leading bytes of a file.
     *
     * <p>When several signatures match, the longest one wins (ties go to the entry added to
     * {@link #FILE_TYPE_MAP} first), so the result no longer depends on hash iteration order.
     *
     * @param b leading bytes of the file
     * @return the matching type key, or {@code null} if no signature matches
     */
    public static String getFileTypeByStream(byte[] b)
    {
        String headerHex = String.valueOf(getFileHexString(b)).toUpperCase();
        String bestType = null;
        int bestLength = -1;
        Iterator<Entry<String, String>> entries = FILE_TYPE_MAP.entrySet().iterator();
        while (entries.hasNext()) {
            Entry<String, String> entry = entries.next();
            String signature = entry.getValue();
            if (signature.length() > bestLength && headerHex.startsWith(signature)) {
                bestType = entry.getKey();
                bestLength = signature.length();
            }
        }
        return bestType;
    }

    /**
     * Detects the type of a file from its first bytes.
     *
     * @param file file to inspect
     * @return the detected type key, or {@code null} if the type is unknown or the file
     *         cannot be read
     */
    public static String getfiletypeByFile(File file)
    {
        String filetype = null;
        byte[] b = new byte[HEADER_LENGTH];
        InputStream is = null;
        try
        {
            is = new FileInputStream(file);
            // A single read() may return fewer bytes than requested; fill as much as is available.
            int filled = 0;
            while (filled < b.length) {
                int n = is.read(b, filled, b.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
            filetype = getFileTypeByStream(b);
        }
        catch (FileNotFoundException e)
        {
            e.printStackTrace();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            // is stays null when the file does not exist; closing it unguarded threw NPE.
            closeQuietly(is);
        }
        return filetype;
    }

    //--------- HTML to text: strips tags, comments and scripts ---------

    /**
     * Converts HTML to plain text by tokenizing it and concatenating the text of each token.
     *
     * @param html HTML source
     * @return the plain-text rendering
     */
    public static String html2text(String html) {
        StringBuffer sb = new StringBuffer(html.length());
        char[] data = html.toCharArray();
        int start = 0;
        boolean previousIsPre = false;
        Token token;
        while ((token = parse(data, start, previousIsPre)) != null) {
            previousIsPre = token.isPreTag();
            sb.append(token.getText());
            start += token.getLength();
        }
        return sb.toString();
    }

    /**
     * Reads the next token (text, comment, script or tag) starting at {@code start}.
     *
     * @return the token, or {@code null} when {@code start} is at or past the end of the data
     */
    private static Token parse(char[] data, int start, boolean previousIsPre) {
        if(start>=data.length) {
            return null;
        }
        if(data[start]!='<') {
            // Plain text up to the next '<' (or the end of the data).
            int next_tag_index = indexOf(data, start+1, '<');
            int textEnd = (next_tag_index==(-1)) ? data.length : next_tag_index;
            return new Token(Token.TOKEN_TEXT, data, start, textEnd, previousIsPre);
        }
        // A tag, comment or script.
        int end_index = indexOf(data, start+1, '>');
        if(end_index==(-1)) {
            // No closing '>': the rest is treated as text.
            return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);
        }
        String s = new String(data, start, end_index-start+1); // s = "<...>"
        if(s.startsWith("<!--")) {
            int end_comment_index = indexOf(data, start+1, "-->");
            // An unterminated comment runs to the end of the data.
            int commentEnd = (end_comment_index==(-1)) ? data.length : end_comment_index+3;
            return new Token(Token.TOKEN_COMMENT, data, start, commentEnd, previousIsPre);
        }
        if(s.toLowerCase().startsWith("<script")) {
            int end_script_index = indexOf(data, start+1, "</script>");
            // An unterminated script runs to the end of the data.
            int scriptEnd = (end_script_index==(-1)) ? data.length : end_script_index+9;
            return new Token(Token.TOKEN_SCRIPT, data, start, scriptEnd, previousIsPre);
        }
        return new Token(Token.TOKEN_TAG, data, start, start+s.length(), previousIsPre);
    }

    /**
     * Returns the index of the first occurrence of {@code s} in {@code data} at or after
     * {@code start}, or -1 if there is none.
     */
    private static int indexOf(char[] data, int start, String s) {
        char[] ss = s.toCharArray();
        // "<=" so that a match ending exactly at the end of data is found
        // (the original "<" skipped the last candidate position).
        for(int i=start; i<=(data.length-ss.length); i++) {
            boolean match = true;
            for(int j=0; j<ss.length; j++) {
                if(data[i+j]!=ss[j]) {
                    match = false;
                    break;
                }
            }
            if(match) {
                return i;
            }
        }
        return (-1);
    }

    /**
     * Returns the index of the first occurrence of {@code c} in {@code data} at or after
     * {@code start}, or -1 if there is none.
     */
    private static int indexOf(char[] data, int start, char c) {
        for(int i=start; i<data.length; i++) {
            if(data[i]==c) {
                return i;
            }
        }
        return (-1);
    }
}
@SuppressWarnings("unchecked")
/**
 * One lexical piece of an HTML document (text, comment, tag or script) together with the
 * plain text it contributes to an HTML-to-text conversion.
 */
class Token {
    public static final int TOKEN_TEXT = 0; // html text.
    public static final int TOKEN_COMMENT = 1; // comment like <!-- comments... -->
    public static final int TOKEN_TAG = 2; // tag like <pre>, <font>, etc.
    public static final int TOKEN_SCRIPT = 3;

    private static final char[] TAG_BR = "<br".toCharArray();
    private static final char[] TAG_P = "<p".toCharArray();
    private static final char[] TAG_LI = "<li".toCharArray();
    private static final char[] TAG_PRE = "<pre".toCharArray();
    private static final char[] TAG_HR = "<hr".toCharArray();
    private static final char[] END_TAG_TD = "</td>".toCharArray();
    private static final char[] END_TAG_TR = "</tr>".toCharArray();
    private static final char[] END_TAG_LI = "</li>".toCharArray();

    /** Longest entity (including '&' and ';') that is looked for. */
    private static final int MAX_ENTITY_LENGTH = 10;

    /** Named character entities and their plain-text replacements. */
    private static final Map<String, String> SPECIAL_CHARS = new HashMap<String, String>();

    static {
        // The keys must be the entity spellings as they appear in HTML source. In the previous
        // version they had been entity-decoded (e.g. "<" instead of "&lt;"), so no lookup
        // could ever match.
        SPECIAL_CHARS.put("&quot;", "\"");
        SPECIAL_CHARS.put("&lt;", "<");
        SPECIAL_CHARS.put("&gt;", ">");
        SPECIAL_CHARS.put("&amp;", "&");
        SPECIAL_CHARS.put("&reg;", "(r)");
        SPECIAL_CHARS.put("&copy;", "(c)");
        SPECIAL_CHARS.put("&nbsp;", " ");
        SPECIAL_CHARS.put("&pound;", "\u00A3");
    }

    private int type;
    private String html; // original html
    private String text = null; // plain text contributed by this token
    private int length = 0; // html length
    private boolean isPre = false; // is this a <pre> tag?

    /**
     * Creates a token from {@code data[start, end)}.
     *
     * @param type          one of the {@code TOKEN_*} constants
     * @param data          the whole document
     * @param start         index of the token's first character
     * @param end           index just past the token's last character
     * @param previousIsPre whether the preceding token was a {@code <pre>} tag, in which case
     *                      whitespace of a text token is preserved
     */
    public Token(int type, char[] data, int start, int end, boolean previousIsPre) {
        this.type = type;
        this.length = end - start;
        this.html = new String(data, start, length);
        parseText(previousIsPre);
    }

    /** Returns the number of characters of HTML source covered by this token. */
    public int getLength() {
        return length;
    }

    /** Returns whether this token is a {@code <pre>} tag. */
    public boolean isPreTag() {
        return isPre;
    }

    /** Returns the plain text for this token; empty for comments, scripts and most tags. */
    public String getText() {
        return text==null ? "" : text;
    }

    /** Returns the original HTML of this token. */
    public String toString() {
        return html;
    }

    /** Derives {@link #text} (and {@link #isPre}) from the token's type and HTML. */
    private void parseText(boolean previousIsPre) {
        if(type==TOKEN_TAG) {
            char[] cs = html.toCharArray();
            if(compareTag(TAG_BR, cs) || compareTag(TAG_P, cs)) {
                text = "\n";
            } else if(compareTag(TAG_LI, cs)) {
                text = "\n* ";
            } else if(compareTag(TAG_PRE, cs)) {
                isPre = true;
            } else if(compareTag(TAG_HR, cs)) {
                text = "\n--------\n";
            } else if(compareString(END_TAG_TD, cs)) {
                text = "\t";
            } else if(compareString(END_TAG_TR, cs) || compareString(END_TAG_LI, cs)) {
                text = "\n";
            }
        } else if(type==TOKEN_TEXT) {
            text = toText(html, previousIsPre);
        }
        // Comments and scripts contribute no text.
    }

    /**
     * Converts HTML text content to plain text: collapses runs of spaces and drops line
     * breaks (unless {@code isPre}), and decodes named and decimal numeric entities.
     */
    private String toText(String html, final boolean isPre) {
        char[] cs = html.toCharArray();
        StringBuilder buffer = new StringBuilder(cs.length);
        int pos = 0;
        boolean inSpaceRun = false;
        while(pos<cs.length) {
            char current = cs[pos];
            char next = (pos+1<cs.length) ? cs[pos+1] : '\0';

            if(current==' ') {
                if(isPre || !inSpaceRun) {
                    buffer.append(' ');
                }
                inSpaceRun = true;
                pos++;
                continue;
            }
            if(current=='\r' && next=='\n') {
                if(isPre) {
                    buffer.append('\n');
                }
                pos+=2;
                continue;
            }
            if(current=='\n' || current=='\r') {
                if(isPre) {
                    buffer.append('\n');
                }
                pos++;
                continue;
            }
            // Note: line breaks above deliberately do not end a run of spaces.
            inSpaceRun = false;

            if(current!='&') {
                buffer.append(current);
                pos++;
                continue;
            }

            // Possibly an entity such as "&amp;" or "&#65;".
            int entityLength = readUtil(cs, pos, ';', MAX_ENTITY_LENGTH);
            if(entityLength==(-1)) { // no ';' nearby: a literal '&'
                buffer.append('&');
                pos++;
                continue;
            }
            String specChar = SPECIAL_CHARS.get(new String(cs, pos, entityLength));
            if(specChar!=null) {
                buffer.append(specChar);
                pos+=entityLength;
                continue;
            }
            if(next=='#') { // maybe a numeric character reference
                String num = new String(cs, pos+2, entityLength-3);
                try {
                    int code = Integer.parseInt(num);
                    if(code>0 && code<65536) {
                        buffer.append((char)code);
                        // Skip the whole reference; advancing by one (as before) left
                        // "#65;" in the output after the decoded character.
                        pos+=entityLength;
                        continue;
                    }
                }
                catch(NumberFormatException ignored) {
                    // Not a decimal number: fall through and emit the characters literally.
                }
                buffer.append("&#");
                pos+=2;
                continue;
            }
            // Unknown named entity: emit '&' and let the rest pass through as text.
            buffer.append('&');
            pos++;
        }
        return buffer.toString();
    }

    /**
     * Looks for {@code util} within at most {@code maxLength} characters from {@code start}.
     *
     * @return the number of characters from {@code start} up to and including {@code util},
     *         or -1 if it is not found within the window
     */
    private int readUtil(final char[] cs, final int start, final char util, final int maxLength) {
        // Clamp the window to the array; the loop previously ignored this bound and threw
        // ArrayIndexOutOfBoundsException for an '&' near the end of the text.
        int end = Math.min(start+maxLength, cs.length);
        for(int i=start; i<end; i++) {
            if(cs[i]==util) {
                return i-start+1;
            }
        }
        return (-1);
    }

    /**
     * Compares a lower-case tag prefix such as {@code "<input"} with an actual tag such as
     * {@code "<INPUT value=aa>"}: the prefix must match case-insensitively and must not be
     * followed by a letter (so {@code "<p"} does not match {@code "<pre>"}).
     */
    private boolean compareTag(final char[] ori_tag, char[] tag) {
        if(ori_tag.length>=tag.length) {
            return false;
        }
        for(int i=0; i<ori_tag.length; i++) {
            if(Character.toLowerCase(tag[i])!=ori_tag[i]) {
                return false;
            }
        }
        char following = Character.toLowerCase(tag[ori_tag.length]);
        return following<'a' || following>'z';
    }

    /** Returns whether {@code comp} starts with {@code ori}, ignoring the case of {@code comp}. */
    private boolean compareString(final char[] ori, char[] comp) {
        if(ori.length>comp.length) {
            return false;
        }
        for(int i=0; i<ori.length; i++) {
            if(Character.toLowerCase(comp[i])!=ori[i]) {
                return false;
            }
        }
        return true;
    }
}