-------------------20080731------------------
修改前端界面,修改成类似baidu的风格
修改index.jsp和results.jsp
研究IndexHTML.java
1)在meta中有中文,就解析不过去了
<meta name="keywords" content="范怡文,歌手,董事长,电子商务,b2b,商业,金融,商界精英,创业,商机,信用记录,信网,中国信网,信用记录" >
<meta name="description" content="关注产业经济新闻,宣传商界精英,剖析成功案例,研究电子商务,提供创业商机!">
2)在注释中有中文,也解析不过去
<!--导航开始-->
考虑把<meta和<!--从解析中去掉 //todo
------------------20080801----------------------
lucene自带的HTML解析功能太弱,改为研究sourceforge上的专业解析库htmlparser
1)已经完成了htmlparser的集成工作,很方便使用,写了一个包装类SimpleHtmlparser,详见附录二
2)完成lucene的高亮处理
--------------------20080802-----------------------
1)查询时排序功能需要完善,使用了默认排序,需要自己重新开发
需要解决的问题
1)思路:查询时,当结果集相当大时,只显示前几页就可以了,尽量提高查询出来结果的可用性,具体在BooleanQuery上修改
2)思路:开多线程去生成索引,然后合并索引,以提高索引效率?
3)问题:查全率、查准率、死链、垃圾信息等
---------------20080804------------
1)完成了对于summary的高亮显示,但现在的处理方式是把body下面的汉字都存储下来了,看是否还需要进一步改进它
2)研究html,网站里面共同的部分如导航条、logo、版权等信息(这些称之为网页的“噪音”)
3)研究URL去重,可以考虑用bloomfilter来处理
4)消除html"噪音"
-----------20080805---------
1)搞定:噪音 利用NodeFilter tagFilter = new NodeClassFilter(TagNode.class);
NodeFilter textFilter = new NodeClassFilter(TextNode.class);
OrFilter orFilter = new OrFilter(tagFilter,textFilter);
结合HtmlNoise来实现,isherf,isscript,iscopyright
这个还需要进一步细化,先实现到这里
package org.apache.lucene.demo;
import java.io.File;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
/**
 * Small wrapper around the htmlparser library: loads one HTML resource and
 * exposes its title, the markup of its body, and the concatenated text of
 * all text nodes found in that body (the text intended for indexing).
 *
 * <p>The {@code meta} property is never populated by parsing; it only holds
 * whatever is passed to {@link #setMeta(String)}.
 */
public class SimpleHtmlparser {
    /** Encoding used when the caller passes no encoding (or {@code null}). */
    public static String DEFAULT_ENCODE="UTF-8";

    private String summary;
    private String title;
    private String body;
    private String meta;
    private String content;

    /**
     * Command-line demo: parses a resource and prints its extracted text.
     *
     * @param args optional; {@code args[0]} is the file path or URL to parse.
     *             When absent, the original hard-coded sample page is used.
     * @throws ParserException declared for compatibility with the original signature
     */
    public static void main(String args[]) throws ParserException {
        String resource = "E:/workspace/searchengine/heritrix/heritrix-1.14.0/target/heritrix-1.14.0/bin/heritrix-1.14.0/jobs/default3-20080730031933671/mirror/news.21315.com/2008/aygz/2008-07-30/8721.html";
        if (args != null && args.length > 0) {
            resource = args[0];
        }
        SimpleHtmlparser sh = new SimpleHtmlparser(resource);
        System.out.println(sh.getContent());
    }

    /**
     * Parses {@code resource} using {@link #DEFAULT_ENCODE}.
     *
     * @param resource file path or URL handed to the htmlparser {@code Parser}
     */
    public SimpleHtmlparser(String resource){
        this(resource,DEFAULT_ENCODE);
    }

    /**
     * Parses {@code resource} with the given encoding.
     *
     * <p>This constructor is best-effort: any failure is reported on standard
     * output and the instance is still returned, with the properties that were
     * not reached left {@code null}.
     *
     * @param resource file path or URL handed to the htmlparser {@code Parser}
     * @param encoding character encoding; {@code null} selects {@link #DEFAULT_ENCODE}
     */
    public SimpleHtmlparser(String resource,String encoding){
        try{
            if(encoding==null) {
                encoding = DEFAULT_ENCODE;
            }
            parseResource(resource,encoding);
        }catch(Exception e){
            System.out.println("SimpleHtmlparser error :"+e.getMessage());
        }
    }

    /**
     * Parses the page and fills in {@code title}, {@code body}, {@code content}
     * and {@code summary} (the summary is currently just the title).
     * {@code meta} is not touched.
     *
     * @param resource file path or URL handed to the htmlparser {@code Parser}
     * @param encoding character encoding used for both parsing passes
     * @throws ParserException if the page, or the body markup extracted from
     *         it, cannot be parsed
     */
    public void parseResource(String resource,String encoding)throws ParserException{
        Parser parser = new Parser(resource);
        parser.setEncoding(encoding);
        HtmlPage htmlpage = new HtmlPage(parser);
        parser.visitAllNodesWith(htmlpage);

        this.title = htmlpage.getTitle();

        // Guard against a page for which the visitor collected no body nodes
        // (NOTE(review): presumably possible for an empty or missing <body>).
        NodeList bodyNodes = htmlpage.getBody();
        this.body = (bodyNodes == null) ? "" : bodyNodes.toHtml();

        this.content = extractText(this.body, encoding);

        // Avoid the literal "null" that string concatenation of a null title would give.
        this.summary = (this.title == null) ? "" : this.title;
    }

    /**
     * Re-parses a fragment of markup and concatenates the text of every text
     * node in it, in document order.
     *
     * A parse failure is propagated to the caller instead of being logged and
     * followed by a dereference of the missing node list.
     */
    private static String extractText(String html, String encoding) throws ParserException {
        Parser textParser = Parser.createParser(html, encoding);
        NodeFilter textFilter = new NodeClassFilter(TextNode.class);
        NodeList textNodes = textParser.parse(textFilter);

        StringBuilder text = new StringBuilder();
        if (textNodes != null) {
            Node[] nodes = textNodes.toNodeArray();
            for (int i = 0; i < nodes.length; i++) {
                if (nodes[i] instanceof TextNode) {
                    text.append(((TextNode) nodes[i]).getText());
                }
            }
        }
        return text.toString();
    }

    /** @return the concatenated text of the body's text nodes */
    public String getContent() {
        return content;
    }

    /** @param content replacement for the extracted text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the summary (the page title unless overridden) */
    public String getSummary() {
        return summary;
    }

    /** @param summary replacement summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the page title */
    public String getTitle() {
        return title;
    }

    /** @param title replacement title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the markup of the page body */
    public String getBody() {
        return body;
    }

    /** @param body replacement body markup */
    public void setBody(String body) {
        this.body = body;
    }

    /** @return the meta value; {@code null} unless set through {@link #setMeta(String)} */
    public String getMeta() {
        return meta;
    }

    /** @param meta meta value to store */
    public void setMeta(String meta) {
        this.meta = meta;
    }
}
修改前端界面,修改成类似baidu的风格
修改index.jsp和results.jsp
研究IndexHTML.java
1)在meta中有中文,就解析不过去了
<meta name="keywords" content="范怡文,歌手,董事长,电子商务,b2b,商业,金融,商界精英,创业,商机,信用记录,信网,中国信网,信用记录" >
<meta name="description" content="关注产业经济新闻,宣传商界精英,剖析成功案例,研究电子商务,提供创业商机!">
2)在注释中有中文,也解析不过去
<!--导航开始-->
考虑把<meta和<!--从解析中去掉 //todo
------------------20080801----------------------
lucene自带的HTML解析功能太弱,改为研究sourceforge上的专业解析库htmlparser
1)已经完成了htmlparser的集成工作,很方便使用,写了一个包装类SimpleHtmlparser,详见附录二
2)完成lucene的高亮处理
--------------------20080802-----------------------
1)查询时排序功能需要完善,使用了默认排序,需要自己重新开发
需要解决的问题
1)思路:查询时,当结果集相当大时,只显示前几页就可以了,尽量提高查询出来结果的可用性,具体在BooleanQuery上修改
2)思路:开多线程去生成索引,然后合并索引,以提高索引效率?
3)问题:查全率、查准率、死链、垃圾信息等
---------------20080804------------
1)完成了对于summary的高亮显示,但现在的处理方式是把body下面的汉字都存储下来了,看是否还需要进一步改进它
2)研究html,网站里面共同的部分如导航条、logo、版权等信息(这些称之为网页的“噪音”)
3)研究URL去重,可以考虑用bloomfilter来处理
4)消除html"噪音"
-----------20080805---------
1)搞定:噪音 利用NodeFilter tagFilter = new NodeClassFilter(TagNode.class);
NodeFilter textFilter = new NodeClassFilter(TextNode.class);
OrFilter orFilter = new OrFilter(tagFilter,textFilter);
结合HtmlNoise来实现,isherf,isscript,iscopyright
这个还需要进一步细化,先实现到这里
package org.apache.lucene.demo;
import java.io.File;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
/**
 * Small wrapper around the htmlparser library: loads one HTML resource and
 * exposes its title, the markup of its body, and the concatenated text of
 * all text nodes found in that body (the text intended for indexing).
 *
 * <p>The {@code meta} property is never populated by parsing; it only holds
 * whatever is passed to {@link #setMeta(String)}.
 */
public class SimpleHtmlparser {
    /** Encoding used when the caller passes no encoding (or {@code null}). */
    public static String DEFAULT_ENCODE="UTF-8";

    private String summary;
    private String title;
    private String body;
    private String meta;
    private String content;

    /**
     * Command-line demo: parses a resource and prints its extracted text.
     *
     * @param args optional; {@code args[0]} is the file path or URL to parse.
     *             When absent, the original hard-coded sample page is used.
     * @throws ParserException declared for compatibility with the original signature
     */
    public static void main(String args[]) throws ParserException {
        String resource = "E:/workspace/searchengine/heritrix/heritrix-1.14.0/target/heritrix-1.14.0/bin/heritrix-1.14.0/jobs/default3-20080730031933671/mirror/news.21315.com/2008/aygz/2008-07-30/8721.html";
        if (args != null && args.length > 0) {
            resource = args[0];
        }
        SimpleHtmlparser sh = new SimpleHtmlparser(resource);
        System.out.println(sh.getContent());
    }

    /**
     * Parses {@code resource} using {@link #DEFAULT_ENCODE}.
     *
     * @param resource file path or URL handed to the htmlparser {@code Parser}
     */
    public SimpleHtmlparser(String resource){
        this(resource,DEFAULT_ENCODE);
    }

    /**
     * Parses {@code resource} with the given encoding.
     *
     * <p>This constructor is best-effort: any failure is reported on standard
     * output and the instance is still returned, with the properties that were
     * not reached left {@code null}.
     *
     * @param resource file path or URL handed to the htmlparser {@code Parser}
     * @param encoding character encoding; {@code null} selects {@link #DEFAULT_ENCODE}
     */
    public SimpleHtmlparser(String resource,String encoding){
        try{
            if(encoding==null) {
                encoding = DEFAULT_ENCODE;
            }
            parseResource(resource,encoding);
        }catch(Exception e){
            System.out.println("SimpleHtmlparser error :"+e.getMessage());
        }
    }

    /**
     * Parses the page and fills in {@code title}, {@code body}, {@code content}
     * and {@code summary} (the summary is currently just the title).
     * {@code meta} is not touched.
     *
     * @param resource file path or URL handed to the htmlparser {@code Parser}
     * @param encoding character encoding used for both parsing passes
     * @throws ParserException if the page, or the body markup extracted from
     *         it, cannot be parsed
     */
    public void parseResource(String resource,String encoding)throws ParserException{
        Parser parser = new Parser(resource);
        parser.setEncoding(encoding);
        HtmlPage htmlpage = new HtmlPage(parser);
        parser.visitAllNodesWith(htmlpage);

        this.title = htmlpage.getTitle();

        // Guard against a page for which the visitor collected no body nodes
        // (NOTE(review): presumably possible for an empty or missing <body>).
        NodeList bodyNodes = htmlpage.getBody();
        this.body = (bodyNodes == null) ? "" : bodyNodes.toHtml();

        this.content = extractText(this.body, encoding);

        // Avoid the literal "null" that string concatenation of a null title would give.
        this.summary = (this.title == null) ? "" : this.title;
    }

    /**
     * Re-parses a fragment of markup and concatenates the text of every text
     * node in it, in document order.
     *
     * A parse failure is propagated to the caller instead of being logged and
     * followed by a dereference of the missing node list.
     */
    private static String extractText(String html, String encoding) throws ParserException {
        Parser textParser = Parser.createParser(html, encoding);
        NodeFilter textFilter = new NodeClassFilter(TextNode.class);
        NodeList textNodes = textParser.parse(textFilter);

        StringBuilder text = new StringBuilder();
        if (textNodes != null) {
            Node[] nodes = textNodes.toNodeArray();
            for (int i = 0; i < nodes.length; i++) {
                if (nodes[i] instanceof TextNode) {
                    text.append(((TextNode) nodes[i]).getText());
                }
            }
        }
        return text.toString();
    }

    /** @return the concatenated text of the body's text nodes */
    public String getContent() {
        return content;
    }

    /** @param content replacement for the extracted text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the summary (the page title unless overridden) */
    public String getSummary() {
        return summary;
    }

    /** @param summary replacement summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the page title */
    public String getTitle() {
        return title;
    }

    /** @param title replacement title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the markup of the page body */
    public String getBody() {
        return body;
    }

    /** @param body replacement body markup */
    public void setBody(String body) {
        this.body = body;
    }

    /** @return the meta value; {@code null} unless set through {@link #setMeta(String)} */
    public String getMeta() {
        return meta;
    }

    /** @param meta meta value to store */
    public void setMeta(String meta) {
        this.meta = meta;
    }
}