搜索引擎的预处理

最新推荐文章于 2022-02-28 22:20:57 发布

杨鑫newlfe

最新推荐文章于 2022-02-28 22:20:57 发布

阅读量1.6k

点赞数

分类专栏： Java 搜索引擎

本文链接：https://blog.csdn.net/u012965373/article/details/39099053

版权

Java 同时被 2 个专栏收录

427 篇文章 33 订阅

订阅专栏

搜索引擎

131 篇文章 2 订阅

订阅专栏

这是我接触搜索引擎以来第一次自己敲代码，这个代码是本书中的一个预处理类。

自己到现在才真正理解这段代码。这是最后一次修改的时间，希望自己可以坚持把这本书边看边学会，也分享给大家。

这里是各位需要的 Java 代码：

package ch2.lucenedemo.preprocess;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;

/**
 * Pre-processing step applied to a text file before it is indexed with Lucene.
 *
 * <p>Two things happen: selected full-width punctuation and digits are
 * converted to their half-width (ASCII) counterparts, and the converted file
 * is cut into smaller files of roughly {@code MAX_CHUNK_BYTES} bytes each.
 *
 * <p>All file access goes through {@link FileReader} and {@link FileWriter}
 * without an explicit charset, i.e. with the platform default charset.
 *
 * @author MzyAiLqq
 */
public class FilePreprocess {

    /** Byte size at which the lines collected so far are flushed to a chunk file. */
    private static final int MAX_CHUNK_BYTES = 10240;

    /** Full-width character (key) to half-width replacement (value). */
    private static final Map<Character, Character> FULL_TO_HALF = buildFullToHalfMap();

    /**
     * Runs the whole pipeline: character conversion into
     * {@code outputDir + "output.all"}, then splitting of that file into
     * {@code outputDir + "output" + n + ".txt"}.
     *
     * <p>Failures are not propagated: the stack trace is printed and the
     * method returns normally.
     *
     * @param file      source file to process
     * @param outputDir output location; it is used as a plain string prefix,
     *                  so it has to end with a path separator to denote a
     *                  directory
     */
    public static void preprocess(File file, String outputDir) {
        try {
            File converted = charactorProcess(file, outputDir + "output.all");
            splitToSmallFiles(converted, outputDir);
        } catch (Exception e) {
            // Best-effort entry point: report the problem, do not rethrow.
            e.printStackTrace();
        }
    }

    /**
     * Copies {@code file} to {@code destFile} line by line, converting the
     * full-width characters known to this class to half-width ones. Every
     * input line, blank lines included, is written followed by the platform
     * line separator.
     *
     * @param file     file to read
     * @param destFile path of the file to create or overwrite
     * @return a {@link File} for {@code destFile}
     * @throws Exception if reading or writing fails (an {@link IOException}
     *                   in practice)
     */
    public static File charactorProcess(File file, String destFile)
            throws Exception {
        // The reader is opened first so that a missing source file does not
        // leave an empty destination file behind. Both streams are closed
        // even when a read or write fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(file));
                BufferedWriter writer = new BufferedWriter(new FileWriter(destFile))) {
            String line;
            // readLine() strips the line terminator, so a line can never equal
            // "\r\n"; the former check against that value was always true.
            while ((line = reader.readLine()) != null) {
                writer.write(replace(line));
                writer.newLine();
            }
        }
        return new File(destFile);
    }

    /**
     * Converts every full-width character that has an entry in the
     * conversion table to its half-width replacement; all other characters
     * are copied unchanged.
     *
     * @param line text to convert
     * @return the converted text, same length as {@code line}
     */
    private static String replace(String line) {
        StringBuilder converted = new StringBuilder(line.length());
        for (int i = 0; i < line.length(); i++) {
            char current = line.charAt(i);
            Character half = FULL_TO_HALF.get(current);
            converted.append(half != null ? half.charValue() : current);
        }
        return converted.toString();
    }

    /**
     * Builds the conversion table once. Keys are written as Unicode escapes
     * so the class compiles the same way whatever source encoding the
     * compiler assumes.
     */
    private static Map<Character, Character> buildFullToHalfMap() {
        Map<Character, Character> table = new HashMap<Character, Character>();
        table.put('\uFF0C', ',');  // fullwidth comma
        table.put('\u3002', '.');  // ideographic full stop
        table.put('\u3008', '<');  // left angle bracket
        table.put('\u3009', '>');  // right angle bracket
        table.put('\uFF5C', '|');  // fullwidth vertical line
        table.put('\u300A', '<');  // left double angle bracket
        table.put('\u300B', '>');  // right double angle bracket
        table.put('\uFF3B', '[');  // fullwidth left square bracket
        table.put('\uFF3D', ']');  // fullwidth right square bracket
        table.put('\uFF1F', '?');  // fullwidth question mark
        table.put('\uFF02', '"');  // fullwidth quotation mark
        table.put('\uFF1A', ':');  // fullwidth colon
        table.put('\uFE51', ',');  // small ideographic comma
        table.put('\uFF08', '(');  // fullwidth left parenthesis
        table.put('\uFF09', ')');  // fullwidth right parenthesis
        table.put('\u3010', '[');  // left black lenticular bracket
        table.put('\u3011', ']');  // right black lenticular bracket
        table.put('\uFF0D', '-');  // fullwidth hyphen-minus
        table.put('\uFFE3', '~');  // fullwidth macron
        table.put('\uFF01', '!');  // fullwidth exclamation mark
        table.put('\uFF40', '`');  // fullwidth grave accent
        // Fullwidth digits U+FF10..U+FF19, including zero.
        for (char digit = '0'; digit <= '9'; digit++) {
            table.put((char) ('\uFF10' + (digit - '0')), digit);
        }
        return table;
    }

    /**
     * Splits {@code file} into files named
     * {@code outputpath + "output" + n + ".txt"} with {@code n} counting up
     * from 0. Lines are collected, each terminated with {@code "\r\n"}, until
     * their encoded size reaches {@code MAX_CHUNK_BYTES}; the collected lines
     * are then written out as one chunk. Lines are never cut, so a chunk may
     * exceed the limit by up to one line.
     *
     * <p>The remainder after the last full chunk goes into one more file.
     * An empty remainder is only written when no chunk has been written yet,
     * so an empty input still produces an empty {@code output0.txt}.
     *
     * @param file       file to split
     * @param outputpath string prefix for the chunk file names (has to end
     *                   with a path separator to denote a directory)
     * @throws Exception if reading or writing fails (an {@link IOException}
     *                   in practice)
     */
    public static void splitToSmallFiles(File file, String outputpath)
            throws Exception {
        int chunkIndex = 0;
        int pendingBytes = 0;
        StringBuilder pending = new StringBuilder();

        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String record = line + "\r\n";
                pending.append(record);
                // Running total instead of re-encoding the whole buffer for
                // every line; same default charset as the FileWriter.
                pendingBytes += record.getBytes().length;

                if (pendingBytes >= MAX_CHUNK_BYTES) {
                    writeChunk(outputpath, chunkIndex, pending.toString());
                    chunkIndex++;
                    pending.setLength(0);
                    pendingBytes = 0;
                }
            }
        }

        // Avoid a spurious empty trailing file when the last line triggered
        // a flush, but keep producing output0.txt for an empty input.
        if (pending.length() > 0 || chunkIndex == 0) {
            writeChunk(outputpath, chunkIndex, pending.toString());
        }
    }

    /** Writes {@code content} to the chunk file with the given index and closes it. */
    private static void writeChunk(String outputpath, int index, String content)
            throws IOException {
        String chunkName = outputpath + "output" + index + ".txt";
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(chunkName))) {
            writer.write(content);
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the source file and
     *             {@code args[1]} the output directory (ending with a path
     *             separator); the original hard-coded locations are used for
     *             missing arguments
     */
    public static void main(String[] args) {
        // Location of the source file to process.
        String inputFile = args.length > 0 ? args[0] : "E:\\book.txt";

        // Location where the processed files are stored.
        String outputDir = args.length > 1 ? args[1] : "E:\\testfolder\\";

        // Create the output directory when it does not exist yet.
        File dir = new File(outputDir);
        if (!dir.exists()) {
            dir.mkdirs();
        }

        preprocess(new File(inputFile), outputDir);
    }
}