Lucene 4.10.4 in Practice: Indexing a Joined Database Query for Faster Lookups



Requirements



During my internship, my supervisor asked me to use Lucene (without Solr) to index the data a colleague needed to query, so that lookups would be faster.

The data in question lives in two tables: a street table and an administrative-district table, where one district maps to N streets. The target search throughput was one thousand queries per second.


Database query

SELECT area_id, area_name, parent_id, parent_name, area_level, `status`, address
FROM area_code_table LEFT JOIN
	(SELECT City, District, GROUP_CONCAT(Address) AS address
	FROM table_address
	GROUP BY City, District) AS addr
ON area_code_table.area_name = addr.District
AND area_code_table.parent_name = addr.City

Besides the LEFT JOIN, the query uses a MySQL row-to-column trick, GROUP_CONCAT, which collapses the grouped rows into a single comma-separated string: three rows aa, bb, cc become the one value aa,bb,cc. This makes the later lookups much more convenient.
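Each joined row maps onto the Area POJO (com.xmmy.areaindex.pojo.Area) that the indexing and search code below relies on but which the post never shows. A minimal sketch follows; the field names come from the SELECT columns and the getters/setters actually called later, while the field types are my assumption.

package com.xmmy.areaindex.pojo;

public class Area {

	private Integer area_id;
	private String area_name;
	private Integer parent_id;
	private String parent_name;
	private Integer area_level;
	private Integer status;
	private String address;	// the GROUP_CONCAT'ed, comma-separated street addresses

	public Integer getArea_id() { return area_id; }
	public void setArea_id(Integer area_id) { this.area_id = area_id; }
	public String getArea_name() { return area_name; }
	public void setArea_name(String area_name) { this.area_name = area_name; }
	public Integer getParent_id() { return parent_id; }
	public void setParent_id(Integer parent_id) { this.parent_id = parent_id; }
	public String getParent_name() { return parent_name; }
	public void setParent_name(String parent_name) { this.parent_name = parent_name; }
	public Integer getArea_level() { return area_level; }
	public void setArea_level(Integer area_level) { this.area_level = area_level; }
	public Integer getStatus() { return status; }
	public void setStatus(Integer status) { this.status = status; }
	public String getAddress() { return address; }
	public void setAddress(String address) { this.address = address; }
}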


Project structure

The project uses iBATIS as a lightweight persistence framework and Maven to organize the build. Pulling in the jar dependencies was painless, but packaging the project turned into a real headache.
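The data-access class com.xmmy.areaindex.db.AreaDaoImpl imported below is not shown either. A minimal sketch under the iBATIS 2.x SqlMapClient API might look like the following; the statement id "Area.selectAllWithAddress" and the method name queryAllAreas are hypothetical placeholders for a mapped statement that runs the LEFT JOIN above.

package com.xmmy.areaindex.db;

import java.sql.SQLException;
import java.util.List;

import com.ibatis.sqlmap.client.SqlMapClient;

import com.xmmy.areaindex.pojo.Area;

public class AreaDaoImpl {

	private final SqlMapClient sqlMapClient;	// built from the sqlmap config elsewhere

	public AreaDaoImpl(SqlMapClient sqlMapClient) {
		this.sqlMapClient = sqlMapClient;
	}

	/**
	 * Load every district row together with its concatenated street addresses.
	 * "Area.selectAllWithAddress" is a hypothetical mapped-statement id.
	 */
	@SuppressWarnings("unchecked")
	public List<Area> queryAllAreas() throws SQLException {
		return (List<Area>) sqlMapClient.queryForList("Area.selectAllWithAddress");
	}
}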


Index creation main program


package com.xmmy.areaindex.index;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.xmmy.areaindex.db.AreaDaoImpl;
import com.xmmy.areaindex.params.IndexParams;
import com.xmmy.areaindex.pojo.Area;

public class IndexCreate {

	public static void main(String[] args) {

	}
	
	public void createAreaIndex(List<Area> areaList) throws IOException {
		
		// directory that holds the index files
		File indexFile = new File(IndexParams.indexAddressSavePath);
		// create the Directory object
		Directory directory = FSDirectory.open(indexFile);
		// bigram (CJK) analyzer, not used
		//Analyzer cjkAnalyzer = new CJKAnalyzer();
		// unigram (single-character) analyzer
		Analyzer standardAnalyzer = new StandardAnalyzer();
		// create the IndexWriterConfig
		IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_4_10_4, standardAnalyzer);
		// IndexWriter configuration
		indexWriterConfig.setMaxBufferedDocs(10000);
		// create the IndexWriter
		IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
		
		int n = 0;
		
		// FieldType shared by every field
		FieldType fieldType = new FieldType();
		fieldType.setIndexed(true);	// index the field
		fieldType.setStored(true);	// store the original value
		fieldType.setStoreTermVectors(true);
		fieldType.setTokenized(true);
		fieldType.setStoreTermVectorPositions(true);	// store term positions
		fieldType.setStoreTermVectorOffsets(true);	// store term offsets
		Document doc = null;
		try {
			//System.out.println("areaList length = " + areaList.size());
			for (Area area : areaList) {
				
				// replace any null values in the Area object
				checkAreaNull(area);
				
				// build one Lucene document per Area
				doc = new Document();
				// add the fields to the document
				doc.add(new Field("area_id", area.getArea_id() + "", fieldType));
				// boost the score of area_name
				Field area_name = new Field("area_name", area.getArea_name(), fieldType);
				area_name.setBoost(10);
				doc.add(area_name);
				doc.add(new Field("parent_id", area.getParent_id() + "", fieldType));
				doc.add(new Field("parent_name", area.getParent_name(), fieldType));
				doc.add(new Field("area_level", area.getArea_level() + "", fieldType));
				doc.add(new Field("status", area.getStatus() + "", fieldType));
				doc.add(new Field("address", area.getAddress(), fieldType));
				n++;
				indexWriter.addDocument(doc);
			}
			System.out.println("indexwriter = " + indexWriter.numDocs());
		} catch (Exception e) {
			// print the record that failed, then the stack trace
			System.out.println(areaList.get(n).toString());
			e.printStackTrace();
		} finally {
			indexWriter.commit();
			indexWriter.forceMerge(1);
			indexWriter.close();
		}
	}
	/**
	 * Replace null values in the Area object with placeholder strings.
	 */
	public void checkAreaNull(Area area) {
		if (area.getArea_name() == null) {
			area.setArea_name("no area_name");
		}
		if (area.getParent_name() == null) {
			area.setParent_name("no parent_name");
		}
		if (area.getAddress() == null) {
			area.setAddress("no address");
		}
	}

}


The indexing was run in batches across multiple threads; in testing it indexed 1.6 million rows without trouble. A sketch of such a driver is shown below.
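The post does not show the multi-threaded driver itself, so the following is only a minimal sketch of one way to do it: a single shared IndexWriter (safe to use from several threads) with the work split into fixed-size batches on a thread pool. The class name ParallelIndexCreate, the method indexInBatches, and the thread/batch parameters are my own; for brevity it also skips the null check and the area_name boost from createAreaIndex above, and is not necessarily the original implementation.

package com.xmmy.areaindex.index;

import java.io.File;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.xmmy.areaindex.params.IndexParams;
import com.xmmy.areaindex.pojo.Area;

public class ParallelIndexCreate {

	public static void indexInBatches(List<Area> areaList, int threads, final int batchSize) throws Exception {
		// one shared IndexWriter; IndexWriter.addDocument may be called from several threads
		IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_4, new StandardAnalyzer());
		config.setMaxBufferedDocs(10000);
		final IndexWriter writer = new IndexWriter(
				FSDirectory.open(new File(IndexParams.indexAddressSavePath)), config);

		// same FieldType settings as createAreaIndex above
		final FieldType fieldType = new FieldType();
		fieldType.setIndexed(true);
		fieldType.setStored(true);
		fieldType.setTokenized(true);
		fieldType.setStoreTermVectors(true);
		fieldType.setStoreTermVectorPositions(true);
		fieldType.setStoreTermVectorOffsets(true);

		ExecutorService pool = Executors.newFixedThreadPool(threads);
		for (int from = 0; from < areaList.size(); from += batchSize) {
			final List<Area> batch = areaList.subList(from, Math.min(from + batchSize, areaList.size()));
			pool.submit(new Runnable() {
				public void run() {
					try {
						for (Area area : batch) {
							Document doc = new Document();
							doc.add(new Field("area_id", area.getArea_id() + "", fieldType));
							doc.add(new Field("area_name", area.getArea_name(), fieldType));
							doc.add(new Field("parent_id", area.getParent_id() + "", fieldType));
							doc.add(new Field("parent_name", area.getParent_name(), fieldType));
							doc.add(new Field("area_level", area.getArea_level() + "", fieldType));
							doc.add(new Field("status", area.getStatus() + "", fieldType));
							doc.add(new Field("address", area.getAddress(), fieldType));
							writer.addDocument(doc);
						}
					} catch (Exception e) {
						e.printStackTrace();
					}
				}
			});
		}
		pool.shutdown();
		pool.awaitTermination(1, TimeUnit.HOURS);

		writer.commit();
		writer.forceMerge(1);
		writer.close();
	}
}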


Searching the index


import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.xmmy.areaindex.params.IndexParams;
import com.xmmy.areaindex.pojo.AreaCode;


public class SearchIndex {
	
	// maximum number of hits to return
	static Integer maxDocNum = 50;
	// fields to search against
	String[] fields = new String[]{"area_name", "address"};
	// query parser over both fields
	QueryParser queryParser = new MultiFieldQueryParser(Version.LATEST, fields, new StandardAnalyzer());
	// IndexReader instance
	IndexReader indexReader = null;
	// IndexSearcher instance
	IndexSearcher indexSearcher = null;

	public SearchIndex() {
		try {
			this.indexReader = DirectoryReader.open(FSDirectory.open(new File(IndexParams.indexOldAreaSavePath)));
			this.indexSearcher = new IndexSearcher(this.indexReader);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	protected List<AreaCode> search(String text) throws Exception {
		List<AreaCode> result = new ArrayList<AreaCode>();
		Query query = queryParser.parse(text);
		TopDocs topDocs = indexSearcher.search(query, maxDocNum);
		ScoreDoc[] scoreDocs = topDocs.scoreDocs;
		for (ScoreDoc scoreDoc : scoreDocs) {
			Document document = indexSearcher.doc(scoreDoc.doc);
			// result object for this hit
			AreaCode areaCode = new AreaCode();
			// populate the object
			// if the match came from the address field, mark source as 1; otherwise 0
			if (document.get("address").contains(text.replace("\"", ""))) {
				areaCode.setSource(1);
			}else {
				areaCode.setSource(0);
			}
			areaCode.setArea_id(Integer.parseInt(document.get("area_id")));
			areaCode.setArea_name(document.get("area_name"));
			areaCode.setParent_id(Integer.parseInt(document.get("parent_id")));
			areaCode.setParent_name(document.get("parent_name"));
			areaCode.setArea_level(Integer.parseInt(document.get("area_level")));
			if (areaCode.getSource() == 1) {
				areaCode.setFreq(getTF(document.get("address"), text));
			}else {
				areaCode.setFreq(1);
			}
			
			result.add(areaCode);
			//System.out.println(result.size());
			
		}
		return result;
		
	}
	
	public static Integer getTF(String address, String text) throws Exception {
		// count how often the keyword occurs in the comma-separated address list
		//String text = "\"尖山街\"";
		int freq = 0;
		StringTokenizer st = new StringTokenizer(address, ",", false);
		while (st.hasMoreElements()) {
			if (st.nextToken().contains(text.replace("\"", ""))) {
				freq++;
			}
		}
		//System.out.println("freq = " + freq);
		return freq;
	}
	
	public List<AreaCode> searchByArea(String q) {
		List<AreaCode> result = null;
		// restrict the query to the area_name field; the quotes force a phrase match
		String query = "area_name:\"" + q + "\"";
		try {
			result = this.search(query);
		} catch (Exception e) {
			e.printStackTrace();
			result = null;
		}
		return result;
	}
	
	public List<AreaCode> searchByAddress(String q){
		List<AreaCode> result = null;
		// restrict the query to the address field; the quotes force a phrase match
		String query = "address:\"" + q + "\"";
		try {
			result = this.search(query);
		} catch (Exception e) {
			e.printStackTrace();
			result = null;
		}
		return result;
	}
	
	
	public static void main(String[] args) {
		
		// the keyword must be wrapped in double quotes to get an exact phrase match
//		String text = "area_name:\"湖南省1\"";	// searches only area_name for 湖南省
		//text = "\"思明区\"";	// matches docs whose searched fields (area_name, address) contain the full string 思明区

		SearchIndex searchIndex = new SearchIndex();
		
		/**
		 * time one thousand queries
		 */
		ArrayList<String> line = new ArrayList<String>();
		try {
			InputStreamReader read = new InputStreamReader(new FileInputStream(new File("F:\\search.txt")));
			BufferedReader bf = new BufferedReader(read);
			String lineTxt = null;
			while ((lineTxt = bf.readLine()) != null) {
				line.add(lineTxt);
			}
			//System.out.println(line.size());
		} catch (Exception e1) {
			e1.printStackTrace();
		}
		Iterator<String> it = line.iterator();
		
		long start = System.currentTimeMillis();
		String searchText = null;
		
		while (it.hasNext()) {
			searchText = "\"" + it.next() + "\"";
			try {
				//System.out.println(searchText);
				searchIndex.search(searchText);
			} catch (Exception e) {
				e.printStackTrace();
			}
		}

		long end = System.currentTimeMillis();
		System.out.println("Elapsed time (ms): " + (end - start));
	}
	
}
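For completeness, a quick usage sketch of the two convenience methods (the keywords are just example values); each wraps the input in double quotes so the whole string is matched as a phrase against a single field:

SearchIndex searchIndex = new SearchIndex();
// phrase match restricted to the area_name field
List<AreaCode> byArea = searchIndex.searchByArea("思明区");
// phrase match restricted to the concatenated address field
List<AreaCode> byAddress = searchIndex.searchByAddress("尖山街");
System.out.println("area hits: " + byArea.size() + ", address hits: " + byAddress.size());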



The project also needed the term frequency of the hit keyword, but after going through the Lucene documentation I did not find a ready-made interface for this, so I implemented it myself (the getTF method above).
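For reference, since the fields are indexed with term vectors, Lucene 4.x can also report how often an analyzed term occurs in a single document via IndexReader.getTermVector. The sketch below is not the approach used in this project, and because StandardAnalyzer splits Chinese text into single characters it only helps for single-term lookups, which is why counting occurrences in the stored, comma-separated address string (getTF above) was simpler here. The class and method names are my own.

import java.io.IOException;

import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

public class TermVectorFreq {

	/**
	 * Return how many times the analyzed term occurs in the given field of
	 * document docId, read from the stored term vector.
	 */
	public static int freqInDoc(IndexReader reader, int docId, String field, String term) throws IOException {
		Terms terms = reader.getTermVector(docId, field);
		if (terms == null) {
			return 0;	// no term vector stored for this field
		}
		TermsEnum termsEnum = terms.iterator(null);
		if (!termsEnum.seekExact(new BytesRef(term))) {
			return 0;	// term does not occur in this document
		}
		DocsEnum docsEnum = termsEnum.docs(null, null);
		if (docsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
			return 0;
		}
		return docsEnum.freq();	// within-document term frequency
	}
}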

