1.提供自动学习数据,对这些数据进行索引
2.从索引里提取一个二维矩阵列表,写入一个文本文件里。关键性代码如下:
java 代码
import java.io.File;
import java.io.IOException;
import java.util.HashMap;

import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.feedsky.classify.util.FileUtil;
- public class MMClassify{
- private static String indexDir = "D:\\projectsvn\\trunk\\classify\\indexFile";//索引目录
- private static String resultFile = "D:\\projectsvn\\trunk\\classify\\resultFile.txt";
- private IndexSearcher searcher ;
- private IndexReader reader;
- private Logger logger = Logger.getLogger(MMClassify.class);
- private HashMap clazzMap = new HashMap();
- private String[] clazzArray = {"C000007","C000008","C000010","C000013","C000014","C000016","C000020","C000022","C000023","C000024"};
- public static void main(String[] args) {
- // Matrix matrix = new Matrix(10,2);
- // matrix.set(0, 1, 0.5);
- // matrix.set(1, 1, 0.5);
- // // System.out.println(matrix);
- MMClassify mm = new MMClassify();
- mm.init();
- mm.process();
- // System.out.println(mm.matrix);
- }
- public void init(){
- try{
- Directory directory = FSDirectory.getDirectory(indexDir);
- reader = IndexReader.open(directory);
- searcher = new IndexSearcher(reader);
- clazzMap.put("C000007", "汽车");
- clazzMap.put("C000008", "财经");
- clazzMap.put("C000010", "IT");
- clazzMap.put("C000013", "健康");
- clazzMap.put("C000014", "体育");
- clazzMap.put("C000016", "旅游");
- clazzMap.put("C000020", "教育");
- clazzMap.put("C000022", "招聘");
- clazzMap.put("C000023", "文化");
- clazzMap.put("C000024", "军事");
- }catch(Exception e){
- e.printStackTrace();
- }
- }
- public void process(){
- try{
- //提取所有关键词的Term集合
- TermEnum terms = reader.terms();
- while (terms.next()) {
- //提取其中一个Term
- Term term = terms.term();
- String keyWord = term.text();
- //判断该关键词的所属域
- if(term.field().equals("itemContent")){
- BooleanQuery booleanQuery = new BooleanQuery();
- Query keywordQuery = new TermQuery(new Term("itemContent",keyWord));
- booleanQuery.add(keywordQuery,Occur.MUST);
- //到索引文件里做全部搜索
- Hits hits = searcher.search(keywordQuery);
- ResultList list = new ResultList(keyWord);
- for (int i = 0; i < hits.length(); i++) {
- Document doc = (Document)hits.doc(i);
- float score = hits.score(i);
- //doc.get("dirname");获得分类编号
- Result result = new Result(score,doc.get("dirname"));
- list.addResult(result);
- }
- HashMap map = list.resultMap;
- //提取二维矩阵表
- StringBuffer resultStr = new StringBuffer();
- for(int i=0;i
- String clazz = clazzArray[i];
- if(map.get(clazz) == null){
- resultStr.append(0 + " ");
- }else{
- // Float score = (Float)map.get(clazz);
- // resultStr.append(score.floatValue() + " ");
- resultStr.append(list.getValue(clazz) + " ");
- }
- }
- // System.out.println(resultStr.toString());
- Boolean isAppend = true;
- File file = new File(resultFile);
- if(!file.isFile()){
- isAppend = false;
- }
- String content = keyWord + " " + resultStr.toString();
- FileUtil.writeFileByLine(content , resultFile , "GBK" , isAppend);
- //提取该关键词在所有分类里的分值
- // Iterator it = map.keySet().iterator();
- // while (it.hasNext()) {
- // // Get key,分类编号
- // String key = (String)it.next();
- // Float score = (Float)map.get(key);
- //
- // //该关键词所在的分类以及分值
- // String outline = keyWord + " clazz:" + (String)clazzMap.get(key) + " score:" + score.floatValue();
- // System.out.println(outline);
- // logger.info(outline);
- //
- // }
- // matrix = new Matrix(hits.length(),2);
- }
- //FileUtil.writeFileByLine(out.toString(), "D:\\projectsvn\\trunk\\classify\\out.txt", "gbk", true);
- }
- }catch(Exception e){
- e.printStackTrace();
- }
- }
- }
/** One scored observation: a score together with the category code it belongs to. */
class Result {
    /** Score of this observation. */
    public Float score;
    /** Category code this score is attributed to. */
    public String clazz;

    /**
     * @param score score of the observation
     * @param clazz category code the score is attributed to
     */
    public Result(float score, String clazz) {
        this.score = score;
        this.clazz = clazz;
    }
}
- class ResultList{
- public HashMap resultMap = new HashMap();
- public String keyword ;
- public ResultList(String keyword) {
- super();
- this.keyword = keyword;
- }
- public void addResult(Result result){
- Object score = resultMap.get(result.clazz);
- if(score !=null ){
- float scorez =(Float)score;
- score = scorez + result.score;
- resultMap.put(result.clazz, score);
- }else{
- resultMap.put(result.clazz, result.score);
- }
- }
- public Float getValue(String clazz){
- Object score = resultMap.get(clazz);
- if(score!=null){
- float countScore = (float)0.000000001;
- Iterator it = resultMap.keySet().iterator();
- while (it.hasNext()) {
- // Get key
- Object key = it.next();
- Float sc = (Float)resultMap.get(key);
- countScore += sc;
- }
- float scoreAve=(Float)score/countScore;
- return scoreAve;
- }
- return (float)0;
- }
- public Float getHot(String clazz){
- Object score = resultMap.get(clazz);
- if(score!=null){
- return (Float)score;
- }
- return (float)0;
- }
- }