一、环境
1、平台:myeclipse8.5
2、框架:Lucene2.9.4/htmlparser
二、开发调试
1、直接上源码吧,加了很多注释,应该可以看得明白,如下
package org.cyxl.lucene.test;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
/**
 * Small crawler / indexer / searcher demo built on Lucene 2.9 and HTMLParser.
 *
 * <p>{@link #main(String[])} collects every link found on a start page, indexes
 * the text of each linked page into the {@code myindex} directory and then runs
 * a sample query against the {@code content} field.
 */
public class ParseURL {
    /** Directory (relative to the working directory) that holds the Lucene index. */
    private static final String INDEX_DIR = "myindex";

    /** Maximum number of hits collected and printed by a search. */
    private static final int MAX_HITS = 5 * 10;

    /** Number of leading characters of the stored page text printed for each hit. */
    private static final int PREVIEW_LENGTH = 100;

    /** Suffix appended to a truncated preview. */
    private static final String PREVIEW_SUFFIX = "......";

    /** URLs already indexed during this run; used to skip duplicates. */
    private static final List<String> urls = new ArrayList<String>();

    /**
     * Indexer: fetches the page at {@code url} and adds it to the index as one
     * document with the fields {@code url}, {@code title} and {@code content}.
     * Pages without a title or without any text are skipped.
     *
     * @param url address of the page to index
     * @throws IOException     if the index directory cannot be created or listed,
     *                         or if writing the index fails
     * @throws ParserException declared by the page-parsing helpers
     */
    @SuppressWarnings("deprecation")
    private static void indexer(String url) throws IOException, ParserException {
        File indexDir = new File(INDEX_DIR);
        if (!indexDir.exists() && !indexDir.mkdirs() && !indexDir.isDirectory()) {
            throw new IOException("Cannot create index directory: "
                    + indexDir.getAbsolutePath());
        }

        // File.list() returns null when the path is not a readable directory.
        String[] existingFiles = indexDir.list();
        if (existingFiles == null) {
            throw new IOException("Cannot list index directory: "
                    + indexDir.getAbsolutePath());
        }
        // An empty directory means a new index must be created; otherwise append.
        boolean create = existingFiles.length == 0;

        // Plain text and title of the page.
        String content = getText(url);
        String title = getTitle(url);
        System.out.println("title:" + title);
        if (title == null || content == null || content.trim().equals("")) {
            return;
        }

        Document doc = new Document();
        // The URL is stored verbatim and indexed as a single, un-analyzed term.
        doc.add(new Field("url", url, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        // Title and content are stored and run through the analyzer.
        doc.add(new Field("title", title, Field.Store.YES,
                Field.Index.ANALYZED));
        doc.add(new Field("content", content, Field.Store.YES,
                Field.Index.ANALYZED));

        IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir),
                new StandardAnalyzer(Version.LUCENE_CURRENT), create,
                IndexWriter.MaxFieldLength.LIMITED);
        try {
            writer.addDocument(doc);
            writer.optimize();
        } finally {
            // Always close the writer so a failed add does not leave it open
            // and block the next indexer call.
            writer.close();
        }

        // Remember the URL so it is not indexed again during this run.
        urls.add(url);
    }

    /**
     * Searcher: parses {@code words} as a query against the {@code content}
     * field and prints title, a short text preview and URL of the best hits.
     *
     * @param words query text in Lucene query-parser syntax
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           if the index cannot be read
     * @throws ParseException        if {@code words} is not a valid query
     */
    @SuppressWarnings("deprecation")
    private static void searcher(String words) throws CorruptIndexException,
            IOException, ParseException {
        File indexDir = new File(INDEX_DIR);
        // Read-only reader on the index directory.
        IndexReader reader = IndexReader.open(FSDirectory.open(indexDir), true);
        try {
            Searcher searcher = new IndexSearcher(reader);
            try {
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
                // Field the query is evaluated against.
                String field = "content";
                QueryParser parser = new QueryParser(field, analyzer);
                Query query = parser.parse(words);
                System.out.println("Searching for: " + query.toString(field));

                // Collect the top hits ordered by score.
                TopScoreDocCollector collector =
                        TopScoreDocCollector.create(MAX_HITS, false);
                searcher.search(query, collector);
                ScoreDoc[] hits = collector.topDocs().scoreDocs;
                int numTotalHits = collector.getTotalHits();
                System.out.println(numTotalHits + " total matching documents");

                for (int i = 0; i < hits.length; i++) {
                    Document doc = searcher.doc(hits[i].doc);
                    String url = doc.get("url");
                    String title = doc.get("title");
                    String content = doc.get("content");
                    System.out.println((i + 1) + "." + title);
                    System.out.println("-----------------------------------");
                    System.out.println(preview(content));
                    System.out.println("-----------------------------------");
                    System.out.println(url);
                    System.out.println();
                }
            } finally {
                searcher.close();
            }
        } finally {
            // The searcher was built from an existing reader, so the reader is
            // closed explicitly here as well.
            reader.close();
        }
    }

    /**
     * Returns the text shown for a hit: the first {@link #PREVIEW_LENGTH}
     * characters followed by {@link #PREVIEW_SUFFIX}, or the whole text
     * unchanged when it is shorter than that (an unconditional
     * {@code substring(0, 100)} would throw for short pages).
     *
     * @param content stored page text, may be {@code null}
     * @return the preview text, never {@code null}
     */
    private static String preview(String content) {
        if (content == null) {
            return "";
        }
        if (content.length() < PREVIEW_LENGTH) {
            return content;
        }
        return content.substring(0, PREVIEW_LENGTH) + PREVIEW_SUFFIX;
    }

    /**
     * Adds a site: indexes every page linked from {@code url} that has not
     * been indexed yet during this run, and prints progress and elapsed time.
     *
     * @param url home page of the site, or the URL of its site map
     * @throws ParserException if the start page cannot be parsed
     * @throws IOException     if writing the index fails
     * @throws ParseException  declared for callers; not raised by this method's own code
     */
    private static void addSite(String url) throws ParserException, IOException, ParseException {
        long start = System.currentTimeMillis();
        System.out.println("start add...");

        // All links found on the start page.
        List<String> links = getLinks(url);
        System.out.println("url count:" + links.size());
        for (int i = 0; i < links.size(); i++) {
            String link = links.get(i);
            System.out.println((i + 1) + "." + link);
            if (!urls.contains(link)) {
                // Not indexed yet.
                indexer(link);
            } else {
                System.out.println("[" + link + "] exist");
            }
        }

        System.out.println("end...");
        long end = System.currentTimeMillis();
        System.out.println("cost " + (end - start) / 1000 + " seconds");
    }

    /**
     * Extracts the plain text of a web page using HTMLParser's {@code StringBean}.
     *
     * @param url address of the page
     * @return the text produced by {@code StringBean.getStrings()}
     * @throws ParserException declared for callers
     */
    private static String getText(String url) throws ParserException {
        StringBean sb = new StringBean();
        // Do not include link information in the extracted text.
        sb.setLinks(false);
        // Replace non-breaking spaces with regular spaces.
        sb.setReplaceNonBreakingSpaces(true);
        // Collapse runs of whitespace into a single space.
        sb.setCollapse(true);
        // Page to extract from.
        sb.setURL(url);
        return sb.getStrings();
    }

    /**
     * Reads the title of a web page.
     *
     * @param path address of the page
     * @return the trimmed title, or {@code "no title"} when the page cannot be
     *         fetched or parsed
     * @throws IOException     declared for callers; failures are handled internally
     * @throws ParserException declared for callers; failures are handled internally
     */
    private static String getTitle(String path) throws IOException,
            ParserException {
        String title = "";
        try {
            Parser parser = new Parser(path);
            HtmlPage page = new HtmlPage(parser);
            parser.visitAllNodesWith(page);
            title = page.getTitle();
        } catch (Exception e) {
            // Best effort: a page whose title cannot be read is still indexed
            // under a placeholder title instead of aborting the crawl.
            title = "no title";
        }
        return title.trim();
    }

    /**
     * Collects the target address of every link tag on a page.
     *
     * @param url address of the page to scan
     * @return the non-blank link targets in document order (duplicates kept)
     * @throws ParserException if the page cannot be fetched or parsed
     */
    private static List<String> getLinks(String url) throws ParserException {
        List<String> links = new ArrayList<String>();
        // Only link tags are of interest.
        NodeFilter filter = new NodeClassFilter(LinkTag.class);
        Parser parser = new Parser();
        parser.setURL(url);

        // The page encoding is not known in advance, so it is probed from a
        // fixed candidate list. dectedEncode returns null when no candidate
        // matched; in that case the parser keeps its own default encoding.
        String encoding = CharsetAutoSwitch.dectedEncode(url);
        if (encoding != null) {
            parser.setEncoding(encoding);
        }

        NodeList list = parser.extractAllNodesThatMatch(filter);
        for (int i = 0; i < list.size(); i++) {
            LinkTag node = (LinkTag) list.elementAt(i);
            String link = node.extractLink();
            if (link != null && !link.trim().equals("")) {
                links.add(link);
            }
        }
        return links;
    }

    /**
     * Demo entry point: indexes the pages linked from the Apache Struts home
     * page, then searches the index for the word "lucene".
     */
    public static void main(String[] args) throws IOException, ParseException,
            InterruptedException, ParserException {
        String url = "http://struts.apache.org/";
        // Crawl and index the site.
        addSite(url);
        // Search for pages containing the word "lucene".
        searcher("lucene");
    }
}
2、这里用到了网上找的一个方法,就是获取网页的编码
package org.cyxl.lucene.test;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.Html;
import org.htmlparser.util.NodeIterator;
/**
* 根据网页的编码类型自动匹配编码,此方法从网上搜索的,准确性有待长时间测试
*
* @author Administrator
*
*/
/**
 * Guesses the character encoding of a web page by trial: each candidate
 * encoding is applied to a fresh parser until one of them yields an HTML or
 * BODY node. The approach was taken from the web and its accuracy has not been
 * verified over a long period.
 */
public class CharsetAutoSwitch {
    // Comma-separated candidate encodings, probed in this order. Extend the
    // list as needed for other encodings.
    private static final String oriEncode = "utf-8,gb2312,gbk,iso-8859-1";

    /**
     * Detects the character set of the page at the given URL.
     *
     * @param url address of the page to probe
     * @return the first candidate encoding accepted by
     *         {@link #dectedCode(String, String)}, or {@code null} when none
     *         of the candidates is accepted
     */
    public static String dectedEncode(String url) {
        for (String candidate : oriEncode.split(",")) {
            if (!dectedCode(url, candidate)) {
                continue;
            }
            System.out.println("code:" + candidate);
            return candidate;
        }
        return null;
    }

    /**
     * Probes a single encoding: parses the page with {@code encode} and
     * reports success as soon as an {@code Html} or {@code BodyTag} node is
     * seen among the nodes returned by the parser.
     *
     * @param url    address of the page to probe
     * @param encode encoding name to try
     * @return {@code true} if such a node was found; {@code false} if none was
     *         found or if any exception occurred while fetching or parsing
     */
    public static boolean dectedCode(String url, String encode) {
        try {
            Parser probe = new Parser(url);
            probe.setEncoding(encode);
            NodeIterator nodes = probe.elements();
            while (nodes.hasMoreNodes()) {
                Node current = nodes.nextNode();
                boolean isDocumentNode = current instanceof Html
                        || current instanceof BodyTag;
                if (isDocumentNode) {
                    return true;
                }
            }
            return false;
        } catch (Exception ignored) {
            // Deliberately ignored: a failure with this candidate only means
            // the caller should try the next one.
            return false;
        }
    }
}
3、测试结果
1)项目的根目录下创建了一个名为myindex的目录,里面的文件如下
2)搜索的结果为
Searching for: lucene
7 total matching documents
1.Home - Confluence
-----------------------------------
Home - Confluence
Dashboard > Bookstore > Home
Page Operations
View
Info
Browse Space
Pages
L......
-----------------------------------
http://www.ApacheBookstore.com/
2.Welcome to The Apache Software Foundation!
-----------------------------------
Welcome to The Apache Software Foundation!
The Apache Software Foundation
Community-led developmen......
-----------------------------------
http://www.apache.org/
3.Apache Struts Stats
-----------------------------------
Apache Struts Stats
apache > cocoon
Home
Stats
About
Index
Changes
FAQ
Apache Stats
Index
......
-----------------------------------
http://people.apache.org/~vgritsenko/stats/projects/struts
4.Thanks
-----------------------------------
Thanks
The Apache Software Foundation
Thanks
Foundation
Projects
People
Get Involved
Download......
-----------------------------------
http://apache.org/foundation/thanks.html
5.Apache Tapestry Home Page
-----------------------------------
Apache Tapestry Home Page
Home
Getting Started
Documentation
Download
About
Community
Apache
......
-----------------------------------
http://tapestry.apache.org/
6.Licenses
-----------------------------------
Licenses
The Apache Software Foundation
Licenses
Foundation
Projects
People
Get Involved
Down......
-----------------------------------
http://www.apache.org/licenses/
7.Sponsorship
-----------------------------------
Sponsorship
The Apache Software Foundation
Sponsorship
Foundation
Projects
People
Get Involved......
-----------------------------------
http://apache.org/foundation/sponsorship.html
三、总结
1、该程序初步完成三个功能:网页爬虫、索引器和搜索器
2、该程序中收录的网址最好为英文网站,因为这里面的分词不支持中文。所以中文分词是接下来研究的一个重点内容