1、附件中附上中文解析器(IKAnalyzer)的相关 jar 包。
2、代码
package com.yihaodian.pricehisotry;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
//import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.apache.lucene.demo.IndexFiles;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.yihaodian.pricehisotry.service.ProductService;
@SuppressWarnings("deprecation")
public class CutWords {

	/** Spring context used to look up the ProductService bean. */
	private static ApplicationContext context = null;
	static {
		context = new ClassPathXmlApplicationContext(
				new String[] { "/spring-bean.xml", "/spring-dao.xml" });
	}

	/** Known brand names loaded from D:\band.txt; used as a set (key == value). */
	private static Map<String, String> brand = new HashMap<String, String>();
	/** Known general nouns loaded from D:\geneName.txt; used as a set (key == value). */
	private static Map<String, String> generalName = new HashMap<String, String>();
	// Collected tokens, bucketed by length in characters (1..6).
	private static Set<String> name1 = new HashSet<String>();
	private static Set<String> name2 = new HashSet<String>();
	private static Set<String> name3 = new HashSet<String>();
	private static Set<String> name4 = new HashSet<String>();
	private static Set<String> name5 = new HashSet<String>();
	private static Set<String> name6 = new HashSet<String>();
	// Tokens longer than 6 characters.
	private static Set<String> name7 = new HashSet<String>();
	// Running count of tokens produced by the analyzer (diagnostic only).
	private static long indexWord = 0;

	// Matches numeric tokens, optionally followed by "ml"/"l" (e.g. "500", "500ml", "1l"),
	// so volume-like tokens are skipped. Compiled once instead of once per token.
	private static final Pattern NUMERIC_TOKEN = Pattern.compile("\\d+[ml]{0,1}[l]{0,1}");

	/**
	 * Loads the existing word lists, tokenizes product names fetched through
	 * ProductService, de-duplicates against known brands and general nouns, and
	 * appends all collected tokens (longest buckets first) to D:\liufen.txt.
	 *
	 * @param args unused
	 * @throws IOException if the output file cannot be created or written
	 */
	public static void main(String[] args) throws IOException {
		// Load the pre-existing "special words", bucketed by length.
		loadWordsByLength(new File("D:\\eee.txt"));
		// Load the known brand names.
		loadWordSet(new File("D:\\band.txt"), brand);
		// Load the known general nouns.
		loadWordSet(new File("D:\\geneName.txt"), generalName);

		System.out.println("得到的品牌的大小:" + brand.size());
		System.out.println("得到的名词的大小:" + generalName.size());

		ProductService productService = (ProductService) (context.getBean("productService"));
		List<String> productNames = productService.queryYhdProductsByCategoryName("%巧克力%");

		File f = new File("D:\\liufen.txt");
		if (!f.exists()) {
			f.createNewFile();
		}

		Analyzer ikAnalyzer = new IKAnalyzer();
		System.out.println("======中文=======IKAnalyzer======分词=======");
		for (String productName : productNames) {
			showToken(ikAnalyzer, productName);
		}

		// Append every bucket, longest words first, one word per line.
		// try/finally guarantees the writer is closed even if a write fails.
		FileWriter fw = new FileWriter(f, true);
		try {
			writeWords(fw, name7);
			writeWords(fw, name6);
			writeWords(fw, name5);
			writeWords(fw, name4);
			writeWords(fw, name3);
			writeWords(fw, name2);
			writeWords(fw, name1);
			fw.flush();
		} finally {
			fw.close();
		}
	}

	/**
	 * Tokenizes {@code text} with the given analyzer and records each token into
	 * the length buckets, skipping numeric/volume tokens and words already known
	 * as brands or general nouns.
	 *
	 * @param analyzer the analyzer to tokenize with
	 * @param text the string to tokenize
	 * @throws IOException if the token stream fails
	 */
	public static void showToken(Analyzer analyzer, String text) throws IOException {
		Reader reader = new StringReader(text);
		TokenStream stream = analyzer.tokenStream("", reader);
		// Lucene 3.x attribute-based API (differs from the 2.x Token API).
		TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
		OffsetAttribute offAtt = stream.addAttribute(OffsetAttribute.class);
		while (stream.incrementToken()) {
			indexWord++;
			String term = termAtt.term();
			// Skip pure numbers and volume-like tokens such as "500ml".
			if (NUMERIC_TOKEN.matcher(term).find()) {
				continue;
			}
			// Skip words that are already known brands or general nouns.
			if (brand.containsKey(term) || generalName.containsKey(term)) {
				continue;
			}
			// Bucket by the token's span in the original text, as before.
			bucketByLength(term, offAtt.endOffset() - offAtt.startOffset());
		}
		stream.close();
	}

	/**
	 * Reads one word per line (GBK) from {@code file} and buckets each word by
	 * its length. A missing or unreadable file is logged and tolerated.
	 */
	private static void loadWordsByLength(File file) {
		BufferedReader br = null;
		try {
			br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "GBK"));
			String line;
			while ((line = br.readLine()) != null) {
				bucketByLength(line, line.length());
			}
		} catch (Exception e) {
			// Was silently swallowed before; at least report what went wrong.
			e.printStackTrace();
		} finally {
			closeQuietly(br);
		}
	}

	/**
	 * Reads one word per line (GBK) from {@code file} into {@code target}
	 * (word -> word; the map is used as a set). Failures are logged and tolerated.
	 */
	private static void loadWordSet(File file, Map<String, String> target) {
		BufferedReader br = null;
		try {
			br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "GBK"));
			String line;
			while ((line = br.readLine()) != null) {
				if (!target.containsKey(line)) {
					target.put(line, line);
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			closeQuietly(br);
		}
	}

	/** Adds {@code word} to the bucket for {@code len}; lengths over 6 go to name7. */
	private static void bucketByLength(String word, int len) {
		switch (len) {
		case 1:
			name1.add(word);
			break;
		case 2:
			name2.add(word);
			break;
		case 3:
			name3.add(word);
			break;
		case 4:
			name4.add(word);
			break;
		case 5:
			name5.add(word);
			break;
		case 6:
			name6.add(word);
			break;
		default:
			name7.add(word);
			break;
		}
	}

	/** Writes every word in {@code words} to {@code fw}, one per line (CRLF). */
	private static void writeWords(FileWriter fw, Set<String> words) throws IOException {
		for (String word : words) {
			fw.write(word + "\r\n");
		}
	}

	/** Closes the reader if non-null, ignoring close failures (best-effort cleanup). */
	private static void closeQuietly(Reader r) {
		if (r != null) {
			try {
				r.close();
			} catch (IOException ignored) {
				// nothing useful to do if close itself fails
			}
		}
	}
}
在本程序中,这段代码对商品名的分词结果进行去重:凡是已知品牌词或已收录的切词都会被跳过,最后将已有的切词和新切出的词全部写入到同一个文件中。
lucene包采用的是3.5最新版本。官网可以下载。
在切词的过程中可能遇到切出的词不尽如人意的情况,这时需要对词库做出整理。另外还可以对解析器做出优化,加强解析器智能识别词组的功能,丰富解析器的词库。