┳
┣com.searchengine.entity.ContentObject.java (内容对象)
┣com.searchengine.util.WebContent.java(取得网页内容)
┗com.searchengine.util.PackContentObject.java (对象转换)
ContentObject.java代码
/**
 * Content object handled by the search engine: one fetched piece of content
 * with its title, body text and creation time.
 *
 * @author JLC
 * @version 1.0
 */
public class ContentObject {
    /** Title of the content. */
    private String title;

    /** Full body text of the content. */
    private String content;

    /** Creation time of the content; defaults to the moment this object is constructed. */
    private Date createDate = new Date();

    /**
     * @return the content title, or {@code null} if none has been set
     */
    public String getTitle() {
        return title;
    }

    /**
     * @param title the content title
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @return the body text, or {@code null} if none has been set
     */
    public String getContent() {
        return content;
    }

    /**
     * @param content the body text
     */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * @return the creation time
     */
    public Date getCreateDate() {
        return createDate;
    }

    /**
     * @param createDate the creation time
     */
    public void setCreateDate(Date createDate) {
        this.createDate = createDate;
    }
}
PackContentObject.java代码
/**
 * Converts between {@code ContentObject} instances and Lucene {@code Document}s.
 *
 * @author JLC
 */
public class PackContentObject {
    /**
     * Converts a content object into a Lucene document with the stored fields
     * {@code title}, {@code content} and {@code createDate} (epoch milliseconds).
     *
     * @param ct the content object to convert; its title and content are passed
     *           to the Lucene field constructors unchanged
     * @return a new document carrying the three fields
     */
    public static Document convertContentToDoc(ContentObject ct) {
        Document doc = new Document();
        // Field.Store.YES stores the value so it can be read back from the index.
        doc.add(new StringField("title", ct.getTitle(), Field.Store.YES));
        doc.add(new TextField("content", ct.getContent(), Field.Store.YES));
        // Index the object's own creation time; fall back to "now" only when it is unset.
        Date created = ct.getCreateDate();
        long createdMillis = (created != null ? created : new Date()).getTime();
        doc.add(new LongField("createDate", createdMillis, Field.Store.YES));
        return doc;
    }

    /**
     * Converts a Lucene document back into a content object.
     *
     * @param doc the document to read the {@code title}, {@code content} and
     *            {@code createDate} fields from
     * @return a new content object; when the document has no {@code createDate}
     *         value, the content object keeps its default creation time
     * @throws NumberFormatException if the stored {@code createDate} is not a valid long
     */
    public static ContentObject convertDocToContent(Document doc) {
        ContentObject ct = new ContentObject();
        ct.setTitle(doc.get("title"));
        ct.setContent(doc.get("content"));
        String storedMillis = doc.get("createDate");
        if (storedMillis != null) {
            ct.setCreateDate(new Date(Long.parseLong(storedMillis)));
        }
        return ct;
    }
}
WebContent.java代码
/**
 * Fetches web pages and extracts parts of their HTML with regular expressions:
 * the title, anchor elements, script blocks, style blocks and tag-free text.
 */
public class WebContent {
    /** Charset used to decode every fetched page. */
    private static final String PAGE_CHARSET = "gbk";

    /** HTML tag names are case-insensitive, and element bodies may span lines. */
    private static final int HTML_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", HTML_FLAGS);
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)</a>", HTML_FLAGS);
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script.*?</script>", HTML_FLAGS);
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style.*?</style>", HTML_FLAGS);
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Reads the complete content of one web page.
     *
     * <p>The page is decoded as GBK and its lines are concatenated without any
     * line separator, so the result is a single line of text.
     *
     * @param htmlurl address of the page to read
     * @return the page content with all line breaks removed
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public String getOneHtml(final String htmlurl) throws IOException {
        final URL url = new URL(htmlurl);
        final StringBuilder page = new StringBuilder();
        final BufferedReader in = new BufferedReader(
                new InputStreamReader(url.openStream(), PAGE_CHARSET));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line);
            }
        } finally {
            // Always release the connection, including when reading fails midway.
            in.close();
        }
        return page.toString();
    }

    /**
     * Extracts the page title.
     *
     * @param s HTML text
     * @return the text of every {@code <title>} element, concatenated and with
     *         tags removed; empty if there is none
     */
    public String getTitle(final String s) {
        final StringBuilder title = new StringBuilder();
        for (final String element : findAll(TITLE_PATTERN, s)) {
            title.append(element);
        }
        return outTag(title.toString());
    }

    /**
     * Extracts the links of a page.
     *
     * @param s HTML text
     * @return every complete {@code <a ... href=...>...</a>} element, in document order
     */
    public List<String> getLink(final String s) {
        return findAll(LINK_PATTERN, s);
    }

    /**
     * Extracts the script code of a page.
     *
     * @param s HTML text
     * @return every complete {@code <script>...</script>} element, in document order
     */
    public List<String> getScript(final String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Extracts the CSS of a page.
     *
     * @param s HTML text
     * @return every complete {@code <style>...</style>} element, in document order
     */
    public List<String> getCSS(final String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Removes markup from a string.
     *
     * @param s text that may contain tags
     * @return the text with every {@code <...>} sequence removed
     */
    public String outTag(final String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Fetches a page and turns it into a content object.
     *
     * <p>This is best-effort: if anything goes wrong the failure is reported on
     * standard error and the returned object is left without title and content.
     *
     * @author JLC
     * @param url address of the page to fetch
     * @return a content object holding the page title and its tag-free text
     */
    public ContentObject getContentFromSite(String url) {
        ContentObject cobj = new ContentObject();
        try {
            String html = getOneHtml(url);
            // getTitle already strips the tags, so no second outTag pass is needed.
            String title = getTitle(html);
            html = html.replaceAll("(<br>)+?", "\n"); // turn line-break tags into newlines
            html = html.replaceAll("<p><em>.*?</em></p>", "");
            cobj.setTitle(title);
            cobj.setContent(outTag(html));
        } catch (final Exception e) {
            // Keep the best-effort contract, but do not hide the failure.
            System.err.println("Failed to fetch content from " + url + ": " + e);
        }
        return cobj;
    }

    /** Collects every match of the pattern in the text, in order of appearance. */
    private static List<String> findAll(final Pattern pattern, final String s) {
        final List<String> matches = new ArrayList<String>();
        final Matcher matcher = pattern.matcher(s);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }
}
mainTest.java测试代码
/**
 * Demo entry point: fetches a few news pages and adds them to the search index.
 */
public class mainTest {
    /** Pages fetched and indexed by the demo. */
    private static final String[] SITE_URLS = {
        "http://news.163.com",
        "http://news.sohu.com/"
    };

    /**
     * Fetches the demo pages and converts each one into a Lucene document.
     *
     * <p>A page that could not be fetched comes back without title or content;
     * such a page is skipped instead of being passed on for conversion.
     *
     * @author JLC
     * @return the documents of all pages that were fetched successfully
     */
    public static List<Document> getWebContentDocuments() {
        List<Document> docs = new ArrayList<Document>();
        WebContent wc = new WebContent();
        for (String url : SITE_URLS) {
            ContentObject content = wc.getContentFromSite(url);
            if (content.getTitle() == null || content.getContent() == null) {
                System.err.println("Skipping " + url + ": no content could be fetched");
                continue;
            }
            docs.add(PackContentObject.convertContentToDoc(content));
        }
        return docs;
    }

    /**
     * Creates the index: adds every fetched document, then commits and
     * refreshes once for the whole batch.
     */
    public static void createSearchEngineData() {
        SearchEngineCore se = SearchObject.getInstance().getLuceneContext("search");
        List<Document> docList = getWebContentDocuments();
        if (docList.isEmpty()) {
            return;
        }
        for (Document doc : docList) {
            try {
                se.getTw().addDocument(doc);
            } catch (Exception e) {
                // One bad document must not prevent the others from being indexed.
                e.printStackTrace();
            }
        }
        try {
            // NOTE(review): assumes commitIndex/refreshData act on everything added
            // so far, so a single call after the loop is enough - confirm in SearchEngineCore.
            se.commitIndex();
            se.refreshData();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        // Build the index data.
        createSearchEngineData();
    }
}
运行 main 方法后，可以看到索引已经创建完成。