为了方便写论文,也方便在论文里复制粘贴,我把自己的学习过程像做笔记一样记录在这里,与大家共勉!
第一步:前期处理。如果文本(或其他数据)过大,需要先把它切分成更小的文本(或其他数据)。切分代码如下:
package mych2_demo.lucenedemo.preprocess;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.HashMap;
import java.util.Map;
/**
 * Pre-processing helpers that normalise the punctuation of a large text file
 * and then cut it into small chunk files.
 *
 * <p>All files are read and written with the platform default charset, which
 * is what {@link FileReader} and {@link FileWriter} use.
 *
 * <p>Typical use:
 * <pre>
 *   String outputDir = "f:\\testfolder\\";
 *   new File(outputDir).mkdirs();
 *   FilePreprocess.preprocess(new File("f:\\book.txt"), outputDir);
 * </pre>
 */
public class FilePreprocess {

    /** A chunk file is closed as soon as it holds at least this many bytes. */
    private static final int MAX_SIZE = 10240;

    /**
     * Full-width / CJK punctuation and circled digits mapped to their ASCII
     * counterparts. Built once instead of on every processed line.
     *
     * <p>Keys are written as Unicode escapes. Several full-width keys (comma,
     * question mark, colon, parentheses, tilde, exclamation mark) had been
     * flattened to plain ASCII, which turned those entries into no-ops.
     */
    private static final Map<Character, Character> HALF_WIDTH = new HashMap<Character, Character>();

    static {
        HALF_WIDTH.put('\uFF0C', ',');   // full-width comma
        HALF_WIDTH.put('\u3002', '.');   // ideographic full stop
        HALF_WIDTH.put('\u3008', '<');   // left angle bracket
        HALF_WIDTH.put('\u3009', '>');   // right angle bracket
        HALF_WIDTH.put('\u2016', '|');   // double vertical line
        HALF_WIDTH.put('\u300A', '<');   // left double angle bracket
        HALF_WIDTH.put('\u300B', '>');   // right double angle bracket
        HALF_WIDTH.put('\u3014', '[');   // left tortoise shell bracket
        HALF_WIDTH.put('\u3015', ']');   // right tortoise shell bracket
        HALF_WIDTH.put('\uFE56', '?');   // small question mark
        HALF_WIDTH.put('\uFF1F', '?');   // full-width question mark
        HALF_WIDTH.put('\u201C', '"');   // left double quotation mark
        HALF_WIDTH.put('\u201D', '"');   // right double quotation mark
        HALF_WIDTH.put('\uFF1A', ':');   // full-width colon
        HALF_WIDTH.put('\u3001', ',');   // ideographic comma
        HALF_WIDTH.put('\uFF08', '(');   // full-width left parenthesis
        HALF_WIDTH.put('\uFF09', ')');   // full-width right parenthesis
        HALF_WIDTH.put('\u3010', '[');   // left black lenticular bracket
        HALF_WIDTH.put('\u3011', ']');   // right black lenticular bracket
        HALF_WIDTH.put('\u2014', '-');   // em dash
        HALF_WIDTH.put('\uFF5E', '~');   // full-width tilde
        HALF_WIDTH.put('\uFF01', '!');   // full-width exclamation mark
        HALF_WIDTH.put('\u2035', '\'');  // reversed prime
        // Circled digits one..nine (U+2460..U+2468) become '1'..'9'.
        for (int d = 0; d < 9; d++) {
            HALF_WIDTH.put((char) ('\u2460' + d), (char) ('1' + d));
        }
    }

    /**
     * Normalises {@code file} and splits the result into small files.
     *
     * <p>The normalised text is written to {@code outputDir + "output.all"} and
     * the chunks to {@code outputDir + "output<N>.txt"}. Any failure is reported
     * on standard error and otherwise ignored.
     *
     * @param file      the source file to process
     * @param outputDir output directory; it is used as a plain string prefix,
     *                  so it must end with a path separator
     */
    public static void preprocess(File file, String outputDir) {
        try {
            splitToSmallFiles(charactorProcess(file, outputDir + "output.all"), outputDir);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Copies {@code file} to {@code destFile}, converting full-width
     * punctuation to half-width and dropping empty lines.
     *
     * @param file     the file to read
     * @param destFile path of the file to write (overwritten if it exists)
     * @return a {@link File} for {@code destFile}
     * @throws Exception if reading or writing fails
     */
    public static File charactorProcess(File file, String destFile) throws Exception {
        // Open the source first so a missing input does not leave an empty output behind.
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            BufferedWriter writer = new BufferedWriter(new FileWriter(destFile));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    // readLine() strips the line terminator, so a blank line
                    // arrives as "" and never as "\r\n".
                    if (line.length() == 0) {
                        continue;
                    }
                    writer.write(replace(line));
                    writer.newLine();
                }
            } finally {
                writer.close();
            }
        } finally {
            reader.close();
        }
        return new File(destFile);
    }

    /**
     * Splits {@code file} into files named {@code output0.txt},
     * {@code output1.txt}, ... under {@code outputpath}.
     *
     * <p>Lines are never cut: whole lines (each terminated with CR LF) are
     * accumulated, and a chunk is written once it reaches {@link #MAX_SIZE}
     * bytes. Whatever remains at the end, even if empty, goes into one last file.
     *
     * @param file       the file to split
     * @param outputpath string prefix of the chunk files; must end with a path
     *                   separator to address a directory
     * @throws Exception if reading or writing fails
     */
    public static void splitToSmallFiles(File file, String outputpath) throws Exception {
        int filePointer = 0;
        int pendingBytes = 0;
        StringBuilder buffer = new StringBuilder();
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String piece = line + "\r\n";
                buffer.append(piece);
                // Keep a running byte count instead of re-encoding the whole
                // buffer after every appended line.
                pendingBytes += piece.getBytes().length;
                if (pendingBytes >= MAX_SIZE) {
                    writeChunk(outputpath, filePointer, buffer.toString());
                    filePointer++;
                    buffer = new StringBuilder();
                    pendingBytes = 0;
                }
            }
        } finally {
            reader.close();
        }
        writeChunk(outputpath, filePointer, buffer.toString());
    }

    /** Writes {@code content} to {@code outputpath + "output" + index + ".txt"}. */
    private static void writeChunk(String outputpath, int index, String content) throws Exception {
        BufferedWriter writer = new BufferedWriter(new FileWriter(outputpath + "output" + index + ".txt"));
        try {
            writer.write(content);
        } finally {
            writer.close();
        }
    }

    /**
     * Returns {@code line} with every character listed in {@link #HALF_WIDTH}
     * replaced by its half-width counterpart; other characters are kept.
     */
    private static String replace(String line) {
        StringBuilder converted = new StringBuilder(line.length());
        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            Character mapped = HALF_WIDTH.get(c);
            converted.append(mapped != null ? mapped.charValue() : c);
        }
        return converted.toString();
    }
}
从网上下载的一部 txt 格式的小说:
切分后如下:
借助Lucene开源框架建立索引:
代码如下:
package mych2_demo.lucenedemo.process;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
/**
 * Builds a Lucene index over the {@code .txt} files found directly inside a
 * directory. Each file becomes one document with a stored, tokenized
 * {@code filename} field and an unstored, tokenized {@code content} field.
 */
public class IndexProcesser {

    /** Directory in which the Lucene index is created. */
    private static final String INDEX_STORE_PATH = "f:\\index";

    /**
     * Creates a fresh index (replacing any existing one) from the {@code .txt}
     * files in {@code inputDir}. Failures are reported on standard error.
     *
     * @param inputDir directory holding the text files to index
     */
    public void createIndex(String inputDir) {
        // List the input before opening the writer: opening with create=true
        // discards the previous index, which should not happen for a bad path.
        File[] files = new File(inputDir).listFiles();
        if (files == null) {
            System.err.println("Cannot list directory: " + inputDir);
            return;
        }

        IndexWriter writer = null;
        try {
            writer = new IndexWriter(INDEX_STORE_PATH, new MMAnalyzer(), true);
            for (int i = 0; i < files.length; i++) {
                String fileName = files[i].getName();
                // endsWith() also copes with names that contain no dot at all;
                // substring(lastIndexOf(".")) would throw for those.
                if (!files[i].isFile() || !fileName.endsWith(".txt")) {
                    continue;
                }
                String content = loadFileToString(files[i]);
                if (content == null) {
                    // The file could not be read (already reported); skip it
                    // rather than aborting the whole run.
                    continue;
                }
                Document doc = new Document();
                // Field holding the file name.
                doc.add(new Field("filename", fileName, Field.Store.YES, Field.Index.TOKENIZED));
                // Field holding the file content.
                doc.add(new Field("content", content, Field.Store.NO, Field.Index.TOKENIZED));
                // Add the document to the IndexWriter.
                writer.addDocument(doc);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always close the writer, also after a failure.
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Reads {@code file} with the platform default charset and returns its
     * lines concatenated without any line separators.
     *
     * @param file the file to read
     * @return the concatenated lines, or {@code null} if the file could not be
     *         read (the error is printed to standard error)
     */
    public String loadFileToString(File file) {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(file));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
            return sb.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if closing a reader fails.
                }
            }
        }
    }

    /** Indexes the chunk files under {@code f:\testfolder}. */
    public static void main(String[] args) {
        IndexProcesser processer = new IndexProcesser();
        processer.createIndex("f:\\testfolder");
    }
}
测试所建立后生成的索引:
建立索引后,我们分别使用 Lucene 索引查询和 Java 自带的 String 字符串匹配进行搜索,并比较两者的耗时。
代码如下:
package mych2_demo.lucenedemo.process;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.StringTokenizer;
import javax.rmi.CORBA.Tie;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
/**
 * Two ways of counting keyword occurrences, each printing its results and the
 * time it took: a lookup in the Lucene index and a plain string scan over the
 * files of a directory.
 */
public class Search {

    /** Directory holding the Lucene index to search. */
    private String INDEX_STORE_PATH = "f:\\index";

    /**
     * Looks up a single term in the index and prints, for every document
     * containing it, the term frequency and the stored {@code filename} field.
     * Failures are reported on standard error.
     *
     * @param searchType name of the field to search
     * @param searchKey  the term text; it is passed to {@link Term} unchanged
     */
    public void indexSearch(String searchType, String searchKey) {
        IndexSearcher searcher = null;
        TermDocs termDocs = null;
        try {
            System.out.println("****使用索引方式搜索**********");
            System.out.println("----------------------------");
            // Open a searcher on the index directory.
            searcher = new IndexSearcher(INDEX_STORE_PATH);
            // searchType is the field, searchKey the keyword.
            Term t = new Term(searchType, searchKey);
            long beginTime = System.currentTimeMillis();
            // Enumeration of <document, frequency> pairs for the term.
            termDocs = searcher.getIndexReader().termDocs(t);
            while (termDocs.next()) {
                // Number of occurrences of the keyword in this document.
                System.out.print("find " + termDocs.freq() + " matches in");
                // Name of the document the keyword was found in.
                System.out.println(searcher.getIndexReader().document(termDocs.doc())
                        .getField("filename").stringValue());
            }
            long timeofSearch = System.currentTimeMillis() - beginTime;
            System.out.println("使用索引方式所花时间 " + timeofSearch + " ms");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the enumeration and the searcher, also after a failure.
            if (termDocs != null) {
                try {
                    termDocs.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Counts non-overlapping occurrences of {@code keyword} in every regular
     * file directly inside {@code searchDir} by plain string matching, then
     * prints the counts and the elapsed time.
     *
     * <p>Each file is read with the platform default charset and its lines are
     * concatenated without separators before matching.
     *
     * @param keyword   the text to look for; must not be null or empty
     * @param searchDir directory whose files are scanned
     */
    public void stringSearch(String keyword, String searchDir) {
        if (keyword == null || keyword.length() == 0) {
            // indexOf("", n) never returns -1, so an empty keyword would make
            // the counting loop below spin forever.
            System.err.println("stringSearch: keyword must not be empty");
            return;
        }
        System.out.println("****利用字符串的索引**********");
        System.out.println("----------------------------");
        File[] files = new File(searchDir).listFiles();
        if (files == null) {
            System.err.println("Cannot list directory: " + searchDir);
            return;
        }
        // LinkedHashMap keeps the files in the order they were scanned.
        Map<String, Integer> rs = new LinkedHashMap<String, Integer>();
        long beginTime = System.currentTimeMillis();
        for (int i = 0; i < files.length; i++) {
            if (!files[i].isFile()) {
                continue;
            }
            BufferedReader br = null;
            try {
                br = new BufferedReader(new FileReader(files[i]));
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    sb.append(line);
                }
                String stringToSearch = sb.toString();
                // Match the keyword repeatedly, resuming right after each hit.
                int hits = 0;
                int fromIndex = stringToSearch.indexOf(keyword);
                while (fromIndex != -1) {
                    hits++;
                    fromIndex = stringToSearch.indexOf(keyword, fromIndex + keyword.length());
                }
                // Record the file name together with its match count.
                rs.put(files[i].getName(), Integer.valueOf(hits));
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                if (br != null) {
                    try {
                        br.close();
                    } catch (Exception ignored) {
                        // Nothing useful can be done if closing a reader fails.
                    }
                }
            }
        }
        for (Map.Entry<String, Integer> entry : rs.entrySet()) {
            System.out.println("find " + entry.getValue().intValue() + " matches in" + entry.getKey());
        }
        long timeofSearch = System.currentTimeMillis() - beginTime;
        System.out.println("使用字符串匹配所耗用时间 " + timeofSearch + " ms");
    }
}
结果的比较:
索引:
字符串:
通过比较可以看出建立索引的好处和优势。