Lucene word-segmentation analyzer: splitting product names into individual terms to make matching easier

1. The attached files contain the jars needed for the Chinese analyzer (IK Analyzer).

2. Code

package com.yihaodian.pricehisotry;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
//import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;



import com.yihaodian.pricehisotry.service.ProductService;


@SuppressWarnings("deprecation")
public class CutWords {
	
private static ApplicationContext context = null;
	
	static{
		context = new ClassPathXmlApplicationContext(
				new String[] { "/spring-bean.xml", "/spring-dao.xml" });	
	}
	private static Map<String,String> brand = new HashMap<String,String>();
	private static Map<String,String> generalName = new HashMap<String,String>();
	
 	private static Set<String> name1 = new HashSet<String>();
	private static Set<String> name2 = new HashSet<String>();
	private static Set<String> name3 = new HashSet<String>();
	private static Set<String> name4 = new HashSet<String>();
	private static Set<String> name5 = new HashSet<String>();
	private static Set<String> name6 = new HashSet<String>();
	
	private static long indexWord = 0;
	// stores terms longer than 6 characters
	private static Set<String> name7 = new HashSet<String>();
	
	
	public static void main(String[] args) throws IOException {
		
		// read the existing special-word list
		 File file = new File("D:\\eee.txt");
	        try {
	            FileInputStream fileInputStream = new FileInputStream(file);  
	            InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream, "GBK");  
	            BufferedReader br = new BufferedReader(inputStreamReader); 
	            String tempString = null;
	            // read one line at a time; readLine() returns null at end of file
	            while ((tempString = br.readLine()) != null) {
	               if(tempString.length() == 1) name1.add(tempString);
	               else if(tempString.length() == 2) name2.add(tempString);
	               else if(tempString.length() == 3) name3.add(tempString);
	               else if(tempString.length() == 4) name4.add(tempString);
	               else if(tempString.length() == 5) name5.add(tempString);
	               else if(tempString.length() == 6) name6.add(tempString);
	               else name7.add(tempString);
	                	
	            }
	            br.close();
	        }catch (Exception e) {
	        	e.printStackTrace();
			}
		
	    // read the existing brand list
	        File fileBrand = new File("D:\\band.txt");
	        try {
	            FileInputStream fileInputStream = new FileInputStream(fileBrand);  
	            InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream, "GBK");  
	            BufferedReader br = new BufferedReader(inputStreamReader); 
	            String tempString = null;
	            // read one line at a time; readLine() returns null at end of file
	            while ((tempString = br.readLine()) != null) {
	            	if (!brand.containsKey(tempString)) {
	            		brand.put(tempString, tempString);        	
					}
	            }
	            br.close();
	        }catch (Exception e) {
	        	e.printStackTrace();
			}
		   
	        // read the existing general-word list
	        File filegeneral = new File("D:\\geneName.txt");
	        try {
	            FileInputStream fileInputStream = new FileInputStream(filegeneral);  
	            InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream, "GBK");  
	            BufferedReader br = new BufferedReader(inputStreamReader); 
	            String tempString = null;
	            // read one line at a time; readLine() returns null at end of file
	            while ((tempString = br.readLine()) != null) {
	            	// (original note: this branch was never executed)
	            	if (!generalName.containsKey(tempString)) {
	            		generalName.put(tempString,tempString);        	
					}
	            }
	            br.close();
	        }catch (Exception e) {
	        	e.printStackTrace();
			}
		
	        System.out.println("得到的品牌的大小:"+brand.size());
	        System.out.println("得到的名词的大小:"+generalName.size());
	        
		ProductService productService = (ProductService)(context.getBean("productService"));
		List<String> productNames = productService.queryYhdProductsByCategoryName("%巧克力%");
		
		File f = new File("D:\\liufen.txt");
		if(!f.exists())
			f.createNewFile();
		FileWriter fw = new FileWriter(f,true);
//		for(int k=0;k<carInfo.size();k++){
//			fw.write(carInfo.get(k).getCar().get(6)+"\r\n");
//		}
		
		Analyzer ikAnalyzer = new IKAnalyzer();
		System.out.println("======中文=======IKAnalyzer======分词=======");
		
		for (String productName : productNames) {
			showToken(ikAnalyzer, productName);
		}
		for (String name : name7) {
			fw.write(name+"\r\n");
		}
		for (String name : name6) {
			fw.write(name+"\r\n");
		}
		for (String name : name5) {
			fw.write(name+"\r\n");
		}
		for (String name : name4) {
			fw.write(name+"\r\n");
		}
		for (String name : name3) {
			fw.write(name+"\r\n");
		}
		for (String name : name2) {
			fw.write(name+"\r\n");
		}
		for (String name : name1) {
			fw.write(name+"\r\n");
		}
		fw.flush();
		fw.close();
		
//		Analyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_30);
//		System.out.println("===== unigram ===== StandardAnalyzer ===== tokenizing =====");
//		showToken(standardAnalyzer, text);
	}
	
	/**
	 * Tokenizes the given text and buckets the resulting terms by length.
	 * @param analyzer     the analyzer to use
	 * @param text         the string to tokenize
	 * @throws IOException if the token stream cannot be read
	 */
	public static void showToken(Analyzer analyzer, String text) throws IOException {
		
		Reader reader = new StringReader(text);
		TokenStream stream = analyzer.tokenStream("", reader);
		// attribute helpers; note that this API differs from Lucene 2.x
		TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
		OffsetAttribute offAtt = stream.addAttribute(OffsetAttribute.class);
		// tokens that look like quantities (e.g. "500ml", "1l") are skipped
		Pattern pattern = Pattern.compile("\\d+[ml]{0,1}[l]{0,1}");
		// iterate over the tokens and bucket each remaining term by its character length
		while(stream.incrementToken()){
			indexWord++;
//			System.out.println(termAtt.term() + "|("+ offAtt.startOffset() + " " + offAtt.endOffset()+")"); 
			Matcher matcher = pattern.matcher(termAtt.term());
			if (matcher.find()) {
				continue;
			}
			
			// skip terms that are already known brands or general nouns
			if (brand.containsKey(termAtt.term()))  continue;
			
			if (generalName.containsKey(termAtt.term())) continue;
			
//			System.out.println(termAtt.term()+"--"+indexWord);
			int len = offAtt.endOffset() - offAtt.startOffset();
			switch (len) {
			case 1:
				name1.add(termAtt.term());
				break;
			case 2:
				name2.add(termAtt.term());
				break;
			case 3:
				name3.add(termAtt.term());
				break;
			case 4:
				name4.add(termAtt.term());
				break;
			case 5:
				name5.add(termAtt.term());
				break;
			case 6:
				name6.add(termAtt.term());
				break;
			default:
				name7.add(termAtt.term());
				break;
			}
		}
	}

}


In this program, tokens cut from a product name that match a known brand, or that have already been collected, are treated as duplicates and skipped. The previously loaded terms and the newly segmented ones are then all written out to a single file.

 

The Lucene jars used here are the latest 3.5 release, which can be downloaded from the official site.
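
Note that TermAttribute is deprecated in Lucene 3.5, which is why the class above carries @SuppressWarnings("deprecation"). Below is a minimal sketch of the same tokenization loop written against the non-deprecated CharTermAttribute API, assuming lucene-core 3.5 and the IK Analyzer jar are on the classpath; the class name IKTokenizeDemo and the sample product name are made up for illustration.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IKTokenizeDemo {

	public static void main(String[] args) throws IOException {
		Analyzer analyzer = new IKAnalyzer();
		// hypothetical product name, used only to demonstrate the loop
		TokenStream stream = analyzer.tokenStream("", new StringReader("费列罗榛果威化巧克力 300g"));
		// CharTermAttribute replaces the deprecated TermAttribute from 3.1 onwards
		CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
		OffsetAttribute offAtt = stream.addAttribute(OffsetAttribute.class);
		stream.reset();
		while (stream.incrementToken()) {
			// print each term with its start/end offsets in the original string
			System.out.println(termAtt.toString() + " (" + offAtt.startOffset() + "," + offAtt.endOffset() + ")");
		}
		stream.end();
		stream.close();
	}
}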

 

During segmentation you may find that some of the resulting terms are not very satisfactory; in that case the dictionary needs to be cleaned up. The analyzer itself can also be tuned, for example by improving its ability to recognize phrases and by enriching its dictionary.
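
IK Analyzer picks up user dictionaries from an IKAnalyzer.cfg.xml on the classpath, which is the usual way to enrich its vocabulary. A sketch of that configuration is shown below, with hypothetical dictionary file names ext.dic and stopword.dic; the exact entry keys can vary between IK versions, so compare with the IKAnalyzer.cfg.xml shipped inside your jar.

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
	<comment>IK Analyzer extension configuration</comment>
	<!-- semicolon-separated extension dictionaries, one word per line, UTF-8 -->
	<entry key="ext_dict">ext.dic;</entry>
	<!-- extension stopword dictionaries -->
	<entry key="ext_stopwords">stopword.dic;</entry>
</properties>

Confirmed brand names and product nouns (for example the contents of band.txt and geneName.txt above) can be appended to ext.dic, one word per line, so that the analyzer keeps them as whole terms on the next run.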
