1 /**2 * Author: Orisun3 * Date: Sep 3, 20114 * FileName: FeatureSelect.java5 * Function: 读取word-doc矩阵,计算每个词的信息增益值,排序。输出IG最大的前300个特征项。6 */7 8 importjava.io.BufferedReader;9 importjava.io.BufferedWriter;10 importjava.io.File;11 importjava.io.FileNotFoundException;12 importjava.io.FileReader;13 importjava.io.FileWriter;14 importjava.util.ArrayList;15 importjava.util.Collections;16 importjava.util.Comparator;17 importjava.util.Iterator;18 importjava.util.Map;19 importjava.util.Map.Entry;20 21 importcom.sleepycat.bind.EntryBinding;22 importcom.sleepycat.bind.serial.SerialBinding;23 importcom.sleepycat.bind.serial.StoredClassCatalog;24 importcom.sleepycat.collections.StoredMap;25 importcom.sleepycat.je.Database;26 importcom.sleepycat.je.DatabaseConfig;27 importcom.sleepycat.je.DatabaseException;28 importcom.sleepycat.je.Environment;29 importcom.sleepycat.je.EnvironmentConfig;30 31 publicclassFeatureSelect {32 privateEnvironment env;33 protectedDatabase database;//用来存放url队列的数据库34 protectedDatabase catalogdatabase;//用来创建StoredClassCatalog实例的数据库35 privatestaticfinalString CLASS_CATALOG="java_class_catalog";//catalogdatabase的数据库名36 protectedStoredClassCatalog javaCatalog;//StoredClassCatalog实例用来序列化对象37 StoredMapFeaDB=null;38 39 publicFeatureSelect(String homeDirectory)throwsDatabaseException,40 FileNotFoundException {41 EnvironmentConfig envConfig=newEnvironmentConfig();//环境配置42 envConfig.setTransactional(true);//允许事务43 envConfig.setAllowCreate(true);//当环境配置不存在时就创建44 env=newEnvironment(newFile(homeDirectory), envConfig);//创建环境45 46 DatabaseConfig dbConfig0=newDatabaseConfig();//数据库配置47 dbConfig0.setTransactional(true);//允许事务48 dbConfig0.setAllowCreate(true);//当数据库不存在时就创建49 catalogdatabase=env.openDatabase(null, CLASS_CATALOG, dbConfig0);50 javaCatalog=newStoredClassCatalog(catalogdatabase);51 52 DatabaseConfig dbConfig=newDatabaseConfig();//数据库配置53 dbConfig.setTransactional(true);//允许事务54 dbConfig.setAllowCreate(true);//当数据库不存在时就创建55 database=env.openDatabase(null,"URL", dbConfig);//打开数据库56 57 EntryBindingkeyBinding=newSerialBinding(58 javaCatalog, String.class);59 EntryBindingvalueBinding=newSerialBinding(60 javaCatalog, Double.class);61 FeaDB=newStoredMap(database, keyBinding,62 valueBinding,true);63 }64 65 publicvoidclose()throwsDatabaseException {66 database.close();//关闭存放url的数据库67 javaCatalog.close();//关闭用来序列化对象的javaCatalog类68 env.close();//关闭环境69 }70 71 publicvoidcalIG(File matrixFile) {72 if(!matrixFile.exists()) {73 System.out.println("Matrix文件不存在.程序退出.");74 System.exit(2);75 }76 77 doubleentropy=Math.log(WordDocMatrix.clanum);78 try{79 FileReader fr=newFileReader(matrixFile);80 BufferedReader br=newBufferedReader(fr);81 String line=null;82 while((line=br.readLine())!=null) {83 String[] content=line.split("\\s+");84 String word=content[0];85 ArrayListal=newArrayList(WordDocMatrix.docnum);86 for(inti=0; i0) {99 wcount_class[i]++;100 }101 }102 wcount+=wcount_class[i];103 }104 pw=1.0*wcount/WordDocMatrix.docnum;105 for(inti=0; i>sort() {126 ArrayList>al=newArrayList>();127 //从数据库中读取数据128 if(!FeaDB.isEmpty()){129 Iterator>iter=FeaDB.entrySet().iterator();130 while(iter.hasNext()){131 Entryentry=iter.next();132 al.add(entry);133 }134 }135 Collections.sort(al,newComparator>() {136 publicintcompare(Map.Entryo1,137 Map.Entryo2) {138 doubleres=o2.getValue()-o1.getValue();139 if(res<0)140 return-1;141 elseif(res>0)142 return1;143 else144 return0;145 }146 });147 returnal;148 }149 150 publicstaticvoidmain(String[] args)throwsException{151 FeatureSelect fs=newFeatureSelect("/home/orisun/develop/workspace");152 fs.calIG(newFile("/home/orisun/matrix/part-r-00000"));153 ArrayList>al=fs.sort();154 fs.close();155 Iterator>iter=al.iterator();156 intn=0;157 File file=newFile("/home/orisun/features");158 try{159 file.createNewFile();160 FileWriter fw=newFileWriter(file);161 BufferedWriter bw=newBufferedWriter(fw);162 while(iter.hasNext()&&n++<300) {163 Entryentry=iter.next();164 bw.write(entry.getKey()+"\t");165 bw.write(String.valueOf(entry.getValue()));166 bw.newLine();167 }168 bw.flush();169 bw.close();170 }catch(Exception e) {171 e.printStackTrace();172 }173 }174 }
java 分类回归树_从原始文档到KNN分类算法实现(二)
最新推荐文章于 2022-05-05 10:56:44 发布