用Lucene 4.7进行词频统计,使用的分词器为IKAnalyzer(Lucene自带分词器对中文的支持较低)。
中文词频统计仅包含中文,英文词频统计仅包含英文。可在中文解析器与英文解析器中进行修改。
在获取到排序好的词频后,可使用d3.cloud在web中展示出来。
Github地址:github.com/panzejia/WebGuide
package cn.iflin.project.participle;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import cn.iflin.project.model.WordModel;
import cn.iflin.project.participle.englishwords.EnglishParser;
import cn.iflin.project.participle.wordcloud.ChineseParser;
/**
主要进行对中文与英语进行词频计算
*/
public class WordsParticiple {

    /**
     * Stub that was meant to check whether the index folder exists before
     * computing term frequencies. It currently just returns an empty list and
     * is never called from this class.
     * NOTE(review): candidate for removal — confirm no external callers first.
     */
    private static ArrayList<WordModel> checkFile(String text, String articleId) {
        ArrayList<WordModel> wordList = new ArrayList<WordModel>();
        return wordList;
    }

    /**
     * Adds one document containing {@code text} to the index. Term vectors
     * (with positions and offsets) are enabled so per-term frequencies can be
     * read back via {@code IndexReader.getTermVector}.
     *
     * @param w    open writer the document is added to (caller closes it)
     * @param text raw text to analyze and index under the "text" field
     * @throws IOException if the writer fails
     */
    private static void addDoc(IndexWriter w, String text) throws IOException {
        Document doc = new Document();
        FieldType ft = new FieldType();
        ft.setIndexed(true);  // index the field (original comments had index/store swapped)
        ft.setStored(true);   // also store the raw value
        ft.setStoreTermVectors(true);
        ft.setTokenized(true);
        ft.setStoreTermVectorPositions(true); // store term positions
        ft.setStoreTermVectorOffsets(true);   // store term offsets
        doc.add(new Field("text", text, ft));
        w.addDocument(doc);
    }

    /**
     * Recursively deletes a file or directory; used to drop a stale index.
     *
     * @param dir file or directory to delete
     * @return true if everything below (and including) {@code dir} was deleted
     */
    private static boolean deleteDir(File dir) {
        if (dir.isDirectory()) {
            String[] children = dir.list();
            if (children != null) { // list() returns null on I/O error
                for (String child : children) {
                    if (!deleteDir(new File(dir, child))) {
                        return false;
                    }
                }
            }
        }
        // directory is now empty (or dir is a plain file) — delete it
        return dir.delete();
    }

    /** Filter applied to each analyzed term before it is kept. */
    private interface TermFilter {
        boolean accept(String term);
    }

    /**
     * Indexes {@code text} with IKAnalyzer and collects every accepted term
     * with its in-document frequency. Shared by both {@code getTF} overloads.
     * Unlike the original copies, the reader and directory are closed in
     * finally blocks (they leaked on exception) and errors are propagated
     * instead of being swallowed by a broad catch.
     */
    private static ArrayList<WordModel> extractTermFrequencies(String text, String articleId, TermFilter filter)
            throws IOException {
        ArrayList<WordModel> wordList = new ArrayList<WordModel>();
        File file = new File("C:\\Spider\\WordCloud_Lucene\\" + articleId);
        deleteDir(file); // drop any stale index for this article
        Analyzer analyzer = new IKAnalyzer(true); // smart mode; false = finest-grained segmentation
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer); // index configuration
        Directory index = FSDirectory.open(file);
        try {
            IndexWriter writer = new IndexWriter(index, config);
            try {
                addDoc(writer, text);
            } finally {
                writer.close(); // release writer resources even on failure
            }
            IndexReader reader = DirectoryReader.open(index);
            try {
                for (int docId = 0; docId < reader.numDocs(); docId++) {
                    Terms terms = reader.getTermVector(docId, "text");
                    if (terms == null) {
                        continue; // document has no term vector for this field
                    }
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef term;
                    while ((term = termsEnum.next()) != null) {
                        String termText = term.utf8ToString();
                        DocsEnum docsEnum = termsEnum.docs(null, null);
                        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            if (filter.accept(termText)) {
                                WordModel wm = new WordModel();
                                wm.setWord(termText);
                                wm.setWordFrequency(docsEnum.freq());
                                wordList.add(wm);
                            }
                        }
                    }
                }
            } finally {
                reader.close();
            }
        } finally {
            index.close();
        }
        return wordList;
    }

    /**
     * Computes English term frequencies for {@code text}.
     *
     * @param text      text to analyze
     * @param articleId id used to name the per-article index directory
     * @param tag       "noLevel" keeps every term; otherwise only terms found
     *                  in the word list for that level (siji/liuji/kaoyan)
     * @return unsorted list of term/frequency pairs
     * @throws IOException on index read/write failure
     */
    public static ArrayList<WordModel> getTF(String text, String articleId, final String tag) throws IOException {
        return extractTermFrequencies(text, articleId, new TermFilter() {
            public boolean accept(String term) {
                // "noLevel" disables the word-list filter entirely
                return tag.equals("noLevel") || EnglishParser.checkEnglishWord(term, tag);
            }
        });
    }

    /**
     * Computes Chinese term frequencies for {@code text}. Only terms that
     * contain a Chinese character and are at least two characters long are kept.
     *
     * @param text      text to analyze
     * @param articleId id used to name the per-article index directory
     * @return unsorted list of term/frequency pairs
     * @throws IOException on index read/write failure
     */
    public static ArrayList<WordModel> getTF(String text, String articleId) throws IOException {
        return extractTermFrequencies(text, articleId, new TermFilter() {
            public boolean accept(String term) {
                // single-character terms are mostly noise for a word cloud
                return ChineseParser.isChinese(term) && term.length() >= 2;
            }
        });
    }
}
接下来对英语进行词频排序
package cn.iflin.project.participle.englishwords;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import cn.iflin.project.model.WordModel;
import cn.iflin.project.participle.WordsParticiple;
import cn.iflin.project.participle.wordcloud.CalculateChineseFrequency;
public class CalculateEnglishFrequency extends WordsParticiple {
    /**
     * Returns the English term-frequency list sorted by descending frequency.
     *
     * @param text      text to segment
     * @param articleId source of the text (database id or "temp" for user-supplied input)
     * @param tag       word-list level: siji / liuji / kaoyan, or "noLevel" for no filtering
     * @return word models sorted highest-frequency first; empty list on I/O failure
     */
    public static ArrayList<WordModel> getWordFre(String text, String articleId, String tag) {
        ArrayList<WordModel> wordList = new ArrayList<WordModel>();
        try {
            // strip common punctuation before indexing
            text = EnglishParser.delPunctuation(text);
            wordList = getTF(text, articleId, tag);
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Typed comparator replaces the original raw Comparator (which relied
        // on unchecked casts). Sorts by frequency, descending.
        Collections.sort(wordList, new Comparator<WordModel>() {
            @Override
            public int compare(WordModel a, WordModel b) {
                return b.getWordFrequency().compareTo(a.getWordFrequency());
            }
        });
        return wordList;
    }
}
最后是对中文词频进行排序
package cn.iflin.project.participle.wordcloud;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import cn.iflin.project.model.WordModel;
import cn.iflin.project.participle.WordsParticiple;
import cn.iflin.project.participle.englishwords.EnglishParser;
/**
* 计算词频
*
* @author Jaypan
*
*/
public class CalculateChineseFrequency extends WordsParticiple {
    /**
     * Returns the Chinese term-frequency list sorted by descending frequency.
     *
     * @param text      text to segment
     * @param articleId source of the text (database id or "temp" for user-supplied input)
     * @return word models sorted highest-frequency first; empty list on I/O failure
     */
    public static ArrayList<WordModel> getWordFre(String text, String articleId) {
        ArrayList<WordModel> wordList = new ArrayList<WordModel>();
        try {
            wordList = getTF(text, articleId);
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Typed comparator replaces the original raw Comparator (which relied
        // on unchecked casts). Sorts by frequency, descending.
        Collections.sort(wordList, new Comparator<WordModel>() {
            @Override
            public int compare(WordModel a, WordModel b) {
                return b.getWordFrequency().compareTo(a.getWordFrequency());
            }
        });
        return wordList;
    }
}
英语解析器
package cn.iflin.project.participle.englishwords;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class EnglishParser {

    // Characters stripped by delPunctuation. Same set the original character
    // class removed: ( ) . , " ? ! : ; ' and the literal '|'. Compiled once
    // instead of on every call.
    private static final Pattern PUNCTUATION = Pattern.compile("[().,\"?!:;'|]");
    // Runs of two or more spaces (left behind after stripping punctuation).
    private static final Pattern MULTI_SPACE = Pattern.compile(" {2,}");

    /**
     * Removes common punctuation and normalizes whitespace.
     * Bug fix: the original replaced runs of 2+ spaces with the EMPTY string,
     * which glued adjacent words together ("a,  b" -> "ab"); runs are now
     * collapsed to a single space.
     *
     * @param text raw English text
     * @return text without punctuation, with space runs collapsed to one space
     */
    public static String delPunctuation(String text) {
        String noPunct = PUNCTUATION.matcher(text).replaceAll("");
        return MULTI_SPACE.matcher(noPunct).replaceAll(" ");
    }

    /**
     * Returns true if {@code checkWord} appears in the word list for the given
     * level. NOTE(review): re-reads the word-list file on every call — consider
     * caching per level if this is hot.
     *
     * @param checkWord    word to look up
     * @param englishClass word-list level (file name without extension)
     */
    public static boolean checkEnglishWord(String checkWord, String englishClass) {
        ArrayList<String> words = getEnglishWords(englishClass);
        for (String word : words) {
            if (word.equals(checkWord)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Loads the word list for the given level, one word per line, GBK-encoded.
     *
     * @param sourceName word-list level (file name without extension)
     * @return the words read, or an empty list if the file is missing/unreadable
     */
    public static ArrayList<String> getEnglishWords(String sourceName) {
        ArrayList<String> words = new ArrayList<String>();
        String filePath = "C:\\Spider\\EnglishWords\\" + sourceName + ".txt";
        File file = new File(filePath);
        if (!file.isFile()) {
            System.out.println("找不到指定文件");
            return words;
        }
        BufferedReader br = null;
        try {
            // word lists are stored as GBK text
            br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "GBK"));
            String lineText;
            while ((lineText = br.readLine()) != null) {
                words.add(lineText);
            }
        } catch (Exception e) {
            System.out.println("读取文件出错");
            e.printStackTrace();
        } finally {
            // original only closed the inner reader, and not on exception
            if (br != null) {
                try {
                    br.close();
                } catch (Exception ignored) {
                    // best-effort close
                }
            }
        }
        return words;
    }
}
中文解析器
package cn.iflin.project.participle.wordcloud;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ChineseParser {

    // One or more consecutive CJK ideographs (basic block U+4E00..U+9FA5).
    // '+' instead of the original '*', which matched the empty string at every
    // position; compiled once instead of per call.
    private static final Pattern CHINESE_RUN = Pattern.compile("[\u4e00-\u9fa5]+");

    /** Returns true if the character is a CJK ideograph in U+4E00..U+9FA5. */
    private static boolean isChinese(char c) {
        return c >= 0x4E00 && c <= 0x9FA5;
    }

    /**
     * Returns true if the string contains at least one Chinese character.
     *
     * @param str string to test; null yields false
     */
    public static boolean isChinese(String str) {
        if (str == null) {
            return false;
        }
        for (char c : str.toCharArray()) {
            if (isChinese(c)) {
                return true; // one Chinese character is enough
            }
        }
        return false;
    }

    /**
     * Splits a sentence into its runs of consecutive Chinese characters.
     * Bug fix: the original wrote into a fixed String[20], overflowing with an
     * ArrayIndexOutOfBoundsException on long input and padding short input
     * with nulls; the result is now exactly sized.
     *
     * @param s input sentence
     * @return the Chinese runs, in order of appearance (possibly empty)
     */
    public static String[] changeList(String s) {
        ArrayList<String> runs = new ArrayList<String>();
        Matcher matcher = CHINESE_RUN.matcher(s);
        while (matcher.find()) {
            runs.add(matcher.group());
        }
        return runs.toArray(new String[runs.size()]);
    }
}