数据库与索引结构

由文章标题可知,我们要建立数据库和索引。

一,定义Product类    
此类相当于MVC中的数据容器,装载了数据库和索引所需要的字段,例如:category、name、type、content、summary、imageURI、originalUrl、updatedtime。字段的顺序没有关系,代码如下:

package com.luceneheritrixbook.core;

/**
 * Plain data holder for one product.
 *
 * <p>Every property is a {@code String} that stays {@code null} until its
 * setter is called. The class performs no validation.
 */
public class Product {
	private String category;
	private String name;
	private String type;
	private String content;
	private String summary;
	private String imageURI;
	private String updatedtime;
	private String originalUrl;

	/** Returns the category, or {@code null} if none has been set. */
	public String getCategory() {
		return this.category;
	}

	/** Stores the category. */
	public void setCategory(String category) {
		this.category = category;
	}

	/** Returns the product name, or {@code null} if none has been set. */
	public String getName() {
		return this.name;
	}

	/** Stores the product name. */
	public void setName(String name) {
		this.name = name;
	}

	/** Returns the product type, or {@code null} if none has been set. */
	public String getType() {
		return this.type;
	}

	/** Stores the product type. */
	public void setType(String type) {
		this.type = type;
	}

	/** Returns the full content text, or {@code null} if none has been set. */
	public String getContent() {
		return this.content;
	}

	/** Stores the full content text. */
	public void setContent(String content) {
		this.content = content;
	}

	/** Returns the summary text, or {@code null} if none has been set. */
	public String getSummary() {
		return this.summary;
	}

	/** Stores the summary text. */
	public void setSummary(String summary) {
		this.summary = summary;
	}

	/** Returns the image URI, or {@code null} if none has been set. */
	public String getImageURI() {
		return this.imageURI;
	}

	/** Stores the image URI. */
	public void setImageURI(String imageURI) {
		this.imageURI = imageURI;
	}

	/** Returns the update time string, or {@code null} if none has been set. */
	public String getUpdatetime() {
		return this.updatedtime;
	}

	/** Stores the update time string. */
	public void setUpdatetime(String updatetime) {
		this.updatedtime = updatetime;
	}

	/** Returns the original URL, or {@code null} if none has been set. */
	public String getOriginalUrl() {
		return this.originalUrl;
	}

	/** Stores the original URL. */
	public void setOriginalUrl(String originalUrl) {
		this.originalUrl = originalUrl;
	}
}

二:定义Lucene的Document格式(即用于搜索的field域)

package com.luceneheritrixbook.index;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import com.luceneheritrixbook.core.Product;

/**
 * Converts a {@link Product} into the Lucene {@link Document} that is written
 * to the search index.
 */
public class ProductDocument {
	private static final String PRODUCT_ID="productid";
	private static final String INDEX_TIME="indextime";
	// NOTE(review): "productrul" looks like a typo for "producturl". It is left
	// unchanged because code that reads the index may already use this name.
	private static final String PRODUCT_URL="productrul";
	private static final String CATEGORY="category";
	private static final String PRODUCT_NAME="name";
	private static final String PRODUCT_TYPE="type";
	// Name of the combined free-text field (category + name + type).
	private static final String ALL="all";
	
	/**
	 * Builds the index document for one product.
	 *
	 * <p>The id, the indexing timestamp and the URL are stored as single
	 * un-tokenized terms; category, name and type are stored and tokenized.
	 * A further tokenized field concatenates category, name and type so that
	 * one query can match any of them.
	 *
	 * @param product the product to index; its URL, category, name and type
	 *                are passed to the {@code Field} constructors as-is
	 * @param id      the identifier written to the id field
	 * @return a new document containing the seven fields described above
	 */
	public static Document buildProductDocument(Product product,int id)
	{
		Document doc=new Document();
		
		Field identifier=new Field(PRODUCT_ID,id+"",Field.Store.YES,
				Field.Index.UN_TOKENIZED);
		
		// Time at which this document was built, in milliseconds since the epoch.
		long mills=System.currentTimeMillis();
		Field indextime=new Field(INDEX_TIME,mills+"",Field.Store.YES,
				Field.Index.UN_TOKENIZED);
		
		Field producturl=new Field(PRODUCT_URL,product.getOriginalUrl(),Field.Store.YES,
				Field.Index.UN_TOKENIZED);
		
		Field category=new Field(CATEGORY,product.getCategory(),Field.Store.YES,
				Field.Index.TOKENIZED);
		
		Field name=new Field(PRODUCT_NAME,product.getName(),Field.Store.YES,
				Field.Index.TOKENIZED);
		
		Field type=new Field(PRODUCT_TYPE,product.getType(),Field.Store.YES,
				Field.Index.TOKENIZED);
		
		String text=product.getCategory();
		text+=" "+product.getName();
		text+=" "+product.getType();
		// The combined text gets its own field name. It was previously added
		// under PRODUCT_ID, which put tokenized free text into the id field
		// next to the real, un-tokenized identifier.
		Field all=new Field(ALL,text,Field.Store.YES,
				Field.Index.TOKENIZED);
		
		doc.add(identifier);
		doc.add(indextime);
		doc.add(producturl);
		doc.add(category);
		doc.add(name);
		doc.add(type);
		doc.add(all);
		
		return doc;
	}

}

 

三、对数据库进行操作(即向数据库中插入获得的product对象)

package com.luceneheritrixbook.database;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;

import com.luceneheritrixbook.core.Product;

/**
 * Small JDBC helper that inserts {@link Product} rows into the
 * {@code product} table over a single connection.
 */
public class ProductJDBC {
	private Connection con = null;

	private boolean autoCommit = true;

	/**
	 * Loads the MySQL driver and opens the connection.
	 *
	 * @param url JDBC URL of the database
	 * @param usr database user name
	 * @param pwd database password
	 * @throws Exception if the driver cannot be loaded or the connection fails
	 */
	public ProductJDBC(String url, String usr, String pwd) throws Exception 
	{
		Class.forName("com.mysql.jdbc.Driver").newInstance();
		con = DriverManager.getConnection(url, usr, pwd);
		
		con.setAutoCommit(autoCommit);
	}

	/**
	 * Inserts one product row.
	 *
	 * <p>The id is computed by {@link #getNextId()} before the insert and is
	 * not part of the INSERT statement itself.
	 * NOTE(review): this presumably relies on the table generating the same id
	 * (e.g. an auto-increment column) and is not safe with concurrent writers;
	 * confirm against the schema.
	 *
	 * @param p the product to store
	 * @return the id computed for the new row
	 * @throws Exception if the id cannot be determined or the insert fails
	 */
	public int addProduct(Product p) throws Exception 
	{

		int nextid = getNextId();

		if (nextid < 0) {
			throw new Exception("Can't get next id.");
		}
		
		String expr="insert into product(content,abstractcontent,url," +
				"imageurl,category,name,type,updatedtime)values(?,?,?,?,?,?,?,?)";
		
		PreparedStatement pstmt=con.prepareStatement(expr);
		try
		{
			pstmt.setString(1, p.getContent());
			pstmt.setString(2, p.getSummary());
			pstmt.setString(3, p.getOriginalUrl());
			pstmt.setString(4, p.getImageURI());
			pstmt.setString(5, p.getCategory());
			pstmt.setString(6, p.getName());
			pstmt.setString(7, p.getType());
			pstmt.setString(8, p.getUpdatetime());
			
			pstmt.execute();
		}
		finally
		{
			// Release the statement even when the insert fails.
			pstmt.close();
		}
		
		return nextid;
	}
	
	/**
	 * Computes the id expected for the next inserted row.
	 *
	 * @return the current maximum id plus one (1 for an empty table), or -1 if
	 *         the query returned no row
	 * @throws Exception if the query fails
	 */
	private int getNextId() throws Exception {

		int result = -1;

		// max(id) is NULL on an empty table and getInt() reports NULL as 0,
		// so coalesce makes the first id 1 instead of 0.
		String sql = "select coalesce(max(id),0)+1 from product";

		Statement stmt = con.createStatement();
		try
		{
			ResultSet rs = stmt.executeQuery(sql);
			try
			{
				while (rs.next()) {
					result = rs.getInt(1);
				}
			}
			finally
			{
				rs.close();
			}
		}
		finally
		{
			stmt.close();
		}

		return result;
	}
	
	/**
	 * Closes the connection if it is open. A failure is printed rather than
	 * thrown, and the connection reference is cleared either way.
	 */
	public void close()
	{
		if(con!=null)
		{
			try
			{
				con.close();
			}
			catch(Exception e)
			{
				e.printStackTrace();
			}
			finally
			{
				con=null;
			}
		}
	}
}
/*
 * 在这里我发现了一个不好的地方,那就是完全相同的两个产品信息可以同时存入数据库
 * */

四、对索引进行操作(其实就是把前面所构建的词库加入JE分词,然后连同Document一起加入索引器)

package com.luceneheritrixbook.index;

import java.io.FileReader;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;

import com.luceneheritrixbook.core.Product;
import com.luceneheritrixbook.searchengine.config.PropertyConfiguration;

/**
 * Owns the Lucene {@link IndexWriter} for the product index and adds
 * {@link Product} documents to it.
 */
public class ProductIndexer
{
	private String indexPath="";
	private IndexWriter writer=null;
	private Analyzer analyzer=null;
	private String dictionary_file=PropertyConfiguration.getWordDictionary();

	/**
	 * Creates the analyzer and opens the index writer.
	 *
	 * @param indexPath directory in which the index is stored
	 * @throws Exception if the dictionary file or the index cannot be opened
	 */
	public ProductIndexer(String indexPath)throws Exception
	{
		this.indexPath=indexPath;
		initialize();
	}

	/**
	 * Builds an {@code MMAnalyzer}, feeds it the configured word dictionary
	 * and opens the writer on {@code indexPath}.
	 */
	private void initialize() throws Exception
	{
		analyzer=new MMAnalyzer();
		// NOTE(review): this reader is never closed. It is left open because it
		// is not visible here whether addDictionary() consumes it immediately.
		FileReader reader=new FileReader(dictionary_file);
		((MMAnalyzer)analyzer).addDictionary(reader);
		// The third argument is Lucene's "create" flag: true starts a new
		// index at indexPath instead of appending to an existing one.
		writer=new IndexWriter(indexPath,analyzer,true);
	}

	/**
	 * Closes the index writer if it is open. A failure is printed rather than
	 * thrown, and the writer reference is cleared either way, so calling this
	 * method again is a no-op.
	 */
	public void close()
	{
		if(writer!=null)
		{
			try
			{
				writer.close();
			}
			catch(Exception e)
			{
				e.printStackTrace();
			}
			finally
			{
				writer=null;
			}
		}
	}

	/**
	 * Adds one product to the index.
	 *
	 * @param product the product to index
	 * @param id      the identifier stored with the document
	 * @throws Exception if the document cannot be written
	 */
	public void addProduct(Product product,int id)throws Exception
	{
		writer.addDocument(ProductDocument.buildProductDocument(product,id));
	}

	/**
	 * Asks the writer to optimize the index.
	 *
	 * @throws Exception if optimization fails
	 */
	public void optimizeIndex()throws Exception
	{
		writer.optimize();
	}
}

五、调用数据库处理类和索引处理类(这是建立数据库和索引最主要的类,主要过程是这样的:首先初始化数据库和索引的实例,然后读取Heritrix抓取的镜像网页所对应的文件,通过File循环遍历目录,从每个文件中读取一个product的详细信息,生成一个Product对象,最后以这个product为参数把数据存入数据库和索引)代码如下:

package com.luceneheritrixbook.core;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;

import com.luceneheritrixbook.database.ProductJDBC;
import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.index.ProductIndexer;
import com.luceneheritrixbook.searchengine.config.PropertyConfiguration;

public class ProductTextFileProcessor
{
	/**
	 * @param args
	 */
	private String[] directionaries;
	
	private static final String dbUrl=PropertyConfiguration.getDBUrl();
	private static final String dbUsr=PropertyConfiguration.getDBUsr();
	private static final String dbPwd=PropertyConfiguration.getDBPwd();
	private static final String indexPath=PropertyConfiguration.getIndexStorePath();
	
	private ProductJDBC productJDBC=null;
	private ProductIndexer indexer=null;
	
	public final static int SUMMARY_LENGTH=80;//内容简介的最大数量
	
	public ProductTextFileProcessor()
	{
		initialize();
	}
	
	public void initialize()
	{
		try
		{
			productJDBC=new ProductJDBC(dbUrl,dbUsr,dbPwd);
			indexer=new ProductIndexer(indexPath);
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}	
	}
	
	public void setDirectionaries(String[] directionaries)
	{
		this.directionaries=directionaries;
	}
	
	protected void process()throws Exception
	{
		if(productJDBC==null)
		{
			throw new Exception("Database connection failed,pls retry");
		}
		
		if(indexer==null)
		{
			throw new Exception("Lucene index failed,pls retry");
		}
		
		if(directionaries==null||directionaries.length==0)
		{
			System.out.print("失败了");
			return;
		}
		
		try
		{
			for(int i=0;iSUMMARY_LENGTH)
			{
				p.setSummary(contentstr.substring(0,SUMMARY_LENGTH-1));
			}
			else
				p.setSummary(contentstr);

			p.setUpdatetime(updatetime);
			//以上一个product对象已存在
			//先存入数据库,然后h获得返回的id值;
			int nextid=insert2DB(p);//这里出现了错误,其实还是ProductJDBC.java里面出现了错误
			
			//用刚返回的id值,向索引中加入Product对象
			buildIndex(p,nextid);	
		}
		//索引优化
		 optimizeindex();
		 /*这只是一个函数,不能直接用来优化索引,不知道为什么
		  * 不直接用indexer.optimizeIndex();
		  */
	}
	
	protected int insert2DB(Product p)throws Exception
	{
		return productJDBC.addProduct(p);
	}
	
	protected void buildIndex(Product p,int nextid)throws Exception
	{
		indexer.addProduct(p,nextid);
	}
	//优化所以你
	private void optimizeindex()throws Exception
	{
		indexer.optimizeIndex();
	}
	
	private void closeIndex()throws Exception
	{
		indexer.close();
	}
	
	private void closeDB()
	{
		productJDBC.close();
	}
	
 /*	public String getDbPwd()
	{
		return dbPwd;
	}
	
	public String getDbUrl()
	{
		return dbUrl;
	}
	
	public String getDbUsr()
	{
		return dbUsr;
	}
	
	public String getIndexPath()
	{
		return indexPath;
	}*/
	//上述方法书上有,但我看来看去,发现它好像也没什么用,就暂时给冻结了,好像也没报错。
	
	public static void main(String[] args) throws Exception
	{
		// TODO Auto-generated method stub
		ProductTextFileProcessor pro=new ProductTextFileProcessor();
		pro.initialize();//前面已经有了,不知道是不是多此一举。
		
		String path1="c://product//mobile//";
		pro.setDirectionaries(new String[]{path1});//这句到底是什么意思
        
		pro.process();
	}
}

数据库还好说,以后肯定要用到,可是这个索引到底有什么用啊,好像后面没用到,不过我猜肯定是我弄错了,怎么可能会用不到,开玩笑嘛,等着看吧……

注:后来发现第五部分的代码有错误,修改如下:

  1. package com.luceneheritrixbook.core;
  2. import java.io.BufferedReader;
  3. import java.io.File;
  4. import java.io.FileReader;
  5. import com.luceneheritrixbook.database.ProductJDBC;
  6. import com.luceneheritrixbook.extractor.Extractor;
  7. import com.luceneheritrixbook.index.ProductIndexer;
  8. import com.luceneheritrixbook.searchengine.config.PropertyConfiguration;
  9. public class ProductTextFileProcessor
  10. {
  11.     /**
  12.      * @param args
  13.      */
  14.     private String[] directionaries;
  15.     
  16.     private static final String dbUrl=PropertyConfiguration.getDBUrl();
  17.     private static final String dbUsr=PropertyConfiguration.getDBUsr();
  18.     private static final String dbPwd=PropertyConfiguration.getDBPwd();
  19.     private static final String indexPath=PropertyConfiguration.getIndexStorePath();
  20.     
  21.     private ProductJDBC productJDBC=null;
  22.     private ProductIndexer indexer=null;
  23.     
  24.     public final static int SUMMARY_LENGTH=80;//到底有什么用呢
  25.     
  26.     public ProductTextFileProcessor()
  27.     {
  28.         initialize();
  29.     }
  30.     
  31.     public void initialize()
  32.     {
  33.         try
  34.         {
  35.             productJDBC=new ProductJDBC(dbUrl,dbUsr,dbPwd);
  36.             indexer=new ProductIndexer(indexPath);
  37.         }
  38.         catch(Exception e)
  39.         {
  40.             e.printStackTrace();
  41.         }   
  42.     }
  43.     
  44.     public void setDirectionaries(String[] directionaries)
  45.     {
  46.         this.directionaries=directionaries;
  47.     }
  48.     
  49.     protected void process()throws Exception
  50.     {
  51.         if(productJDBC==null)
  52.         {
  53.             throw new Exception("Database connection failed,pls retry");
  54.         }
  55.         
  56.         if(indexer==null)
  57.         {
  58.             throw new Exception("Lucene index failed,pls retry");
  59.         }
  60.         
  61.         if(directionaries==null||directionaries.length==0)
  62.         {
  63.             System.out.print("失败了");
  64.             return;
  65.         }
  66.         
  67.         try
  68.         {
  69.             for(int i=0;i<directionaries.length;i++)
  70.             {
  71.                 File f=new File(directionaries[i]);
  72.                 traverse(f);
  73.             }
  74.             //处理完成后关闭数据库
  75.             closeDB();
  76.             
  77.             //处理完成后关闭索引器
  78.             closeIndex();
  79.         }
  80.         catch(Exception e)
  81.         {
  82.             e.printStackTrace();
  83.         }
  84.     }
  85.     
  86.     protected void traverse(File file)throws Exception
  87.     {
  88.         String[] files=file.list();
  89.         for(int i=0;i<files.length;i++)
  90.         {
  91.             File productfile=new File(file,files[i]);
  92.             
  93.             String fname=productfile.getName();
  94.             System.out.println(fname);
  95.             
  96.             BufferedReader reader=new BufferedReader(new FileReader(productfile));
  97.             
  98.             String url=reader.readLine();
  99.             String name=reader.readLine();
  100.             String type=reader.readLine();
  101.             String imageURI="";
  102.             String updatetime=fname.substring(fname.lastIndexOf("-")+1,fname.lastIndexOf("."));
  103.             
  104.             StringBuffer content=new StringBuffer();
  105.             String line=reader.readLine();
  106.             while(line!=null&&!line.equals(Extractor.SEPARATOR))//&&!line.equals(Extractor.SEPARATOR) 难道调用了前面的东西
  107.             {
  108.                 content.append(line).append("/r/n");
  109.                 line=reader.readLine();
  110.             }
  111.             
  112.             imageURI=reader.readLine();
  113.             
  114.             //生成并设置"一个"product对象
  115.             Product p=new Product();
  116.             p.setCategory("手机");
  117.             p.setName(name);
  118.             p.setType(type);
  119.             p.setImageURI(imageURI);
  120.             //p.setContent(content);//为什么会出错呢?
  121.             p.setOriginalUrl(url);
  122.             
  123.             String contentstr=content.toString();
  124.             p.setContent(contentstr);
  125.             
  126.             if(contentstr.length()>SUMMARY_LENGTH)
  127.             {
  128.                 p.setSummary(contentstr.substring(0,SUMMARY_LENGTH-1));
  129.             }
  130.             else
  131.                 p.setSummary(contentstr);
  132.             p.setUpdatetime(updatetime);
  133.             //以上一个product对象已存在
  134.             //先存入数据库,然后h获得返回的id值;
  135.             int nextid=insert2DB(p);//这里出现了错误,其实还是ProductJDBC.java里面出现了错误
  136.             
  137.             //用刚返回的id值,向索引中加入Product对象
  138.             buildIndex(p,nextid);   
  139.         }
  140.         //索引优化
  141.          optimizeindex();
  142.          /*这只是一个函数,不能直接用来优化索引,不知道为什么
  143.           * 不直接用indexer.optimizeIndex();
  144.           */
  145.     }
  146.     
  147.     protected int insert2DB(Product p)throws Exception
  148.     {
  149.         return productJDBC.addProduct(p);
  150.     }
  151.     
  152.     protected void buildIndex(Product p,int nextid)throws Exception
  153.     {
  154.         indexer.addProduct(p,nextid);
  155.     }
  156.     //优化所以你
  157.     private void optimizeindex()throws Exception
  158.     {
  159.         indexer.optimizeIndex();
  160.     }
  161.     
  162.     private void closeIndex()throws Exception
  163.     {
  164.         indexer.close();
  165.     }
  166.     
  167.     private void closeDB()
  168.     {
  169.         productJDBC.close();
  170.     }
  171.     
  172.  /* public String getDbPwd()
  173.     {
  174.         return dbPwd;
  175.     }
  176.     
  177.     public String getDbUrl()
  178.     {
  179.         return dbUrl;
  180.     }
  181.     
  182.     public String getDbUsr()
  183.     {
  184.         return dbUsr;
  185.     }
  186.     
  187.     public String getIndexPath()
  188.     {
  189.         return indexPath;
  190.     }*/
  191.     //上述方法书上有,但我看来看去,发现它好像也没什么用,就暂时给冻结了,好像也没报错。
  192.     
  193.     public static void main(String[] args) throws Exception
  194.     {
  195.         // TODO Auto-generated method stub
  196.         ProductTextFileProcessor pro=new ProductTextFileProcessor();
  197.         pro.initialize();//前面已经有了,不知道是不是多此一举。
  198.         
  199.         String path1="c://product//mobile//";
  200.         pro.setDirectionaries(new String[]{path1});//这句到底是什么意思
  201.         
  202.         pro.process();
  203.     }
  204. }
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值