用Lucene 4.7进行词频统计,使用的分词器为IKAnalyzer(Lucene自带分词器对中文的支持较低)。
中文词频统计仅包含中文,英文词频统计仅包含英文。可在中文解析器与英文解析器中进行修改。
在获取到排序好的词频后,可使用d3.cloud在web中展示出来。
Github地址:github.com/panzejia/WebGuide
package cn.iflin.project.participle;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import cn.iflin.project.model.WordModel;
import cn.iflin.project.participle.englishwords.EnglishParser;
import cn.iflin.project.participle.wordcloud.ChineseParser;
/**
主要进行对中文与英语进行词频计算
*/
public class WordsParticiple {

    /**
     * Stub that was meant to check whether the index folder exists before
     * computing term frequencies. It currently just returns an empty list and
     * is never called from this class.
     * NOTE(review): candidate for removal — confirm no external callers first.
     */
    private static ArrayList<WordModel> checkFile(String text, String articleId) {
        ArrayList<WordModel> wordList = new ArrayList<WordModel>();
        return wordList;
    }

    /**
     * Adds one document containing {@code text} to the index. Term vectors
     * (with positions and offsets) are enabled so per-term frequencies can be
     * read back via {@code IndexReader.getTermVector}.
     *
     * @param w    open writer the document is added to (caller closes it)
     * @param text raw text to analyze and index under the "text" field
     * @throws IOException if the writer fails
     */
    private static void addDoc(IndexWriter w, String text) throws IOException {
        Document doc = new Document();
        FieldType ft = new FieldType();
        ft.setIndexed(true);  // index the field (original comments had index/store swapped)
        ft.setStored(true);   // also store the raw value
        ft.setStoreTermVectors(true);
        ft.setTokenized(true);
        ft.setStoreTermVectorPositions(true); // store term positions
        ft.setStoreTermVectorOffsets(true);   // store term offsets
        doc.add(new Field("text", text, ft));
        w.addDocument(doc);
    }

    /**
     * Recursively deletes a file or directory; used to drop a stale index.
     *
     * @param dir file or directory to delete
     * @return true if everything below (and including) {@code dir} was deleted
     */
    private static boolean deleteDir(File dir) {
        if (dir.isDirectory()) {
            String[] children = dir.list();
            if (children != null) { // list() returns null on I/O error
                for (String child : children) {
                    if (!deleteDir(new File(dir, child))) {
                        return false;
                    }
                }
            }
        }
        // directory is now empty (or dir is a plain file) — delete it
        return dir.delete();
    }

    /** Filter applied to each analyzed term before it is kept. */
    private interface TermFilter {
        boolean accept(String term);
    }

    /**
     * Indexes {@code text} with IKAnalyzer and collects every accepted term
     * with its in-document frequency. Shared by both {@code getTF} overloads.
     * Unlike the original copies, the reader and directory are closed in
     * finally blocks (they leaked on exception) and errors are propagated
     * instead of being swallowed by a broad catch.
     */
    private static ArrayList<WordModel> extractTermFrequencies(String text, String articleId, TermFilter filter)
            throws IOException {
        ArrayList<WordModel> wordList = new ArrayList<WordModel>();
        File file = new File("C:\\Spider\\WordCloud_Lucene\\" + articleId);
        deleteDir(file); // drop any stale index for this article
        Analyzer analyzer = new IKAnalyzer(true); // smart mode; false = finest-grained segmentation
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer); // index configuration
        Directory index = FSDirectory.open(file);
        try {
            IndexWriter writer = new IndexWriter(index, config);
            try {
                addDoc(writer, text);
            } finally {
                writer.close(); // release writer resources even on failure
            }
            IndexReader reader = DirectoryReader.open(index);
            try {
                for (int docId = 0; docId < reader.numDocs(); docId++) {
                    Terms terms = reader.getTermVector(docId, "text");
                    if (terms == null) {
                        continue; // document has no term vector for this field
                    }
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef term;
                    while ((term = termsEnum.next()) != null) {
                        String termText = term.utf8ToString();
                        DocsEnum docsEnum = termsEnum.docs(null, null);
                        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            if (filter.accept(termText)) {
                                WordModel wm = new WordModel();
                                wm.setWord(termText);
                                wm.setWordFrequency(docsEnum.freq());
                                wordList.add(wm);
                            }
                        }
                    }
                }
            } finally {
                reader.close();
            }
        } finally {
            index.close();
        }
        return wordList;
    }

    /**
     * Computes English term frequencies for {@code text}.
     *
     * @param text      text to analyze
     * @param articleId id used to name the per-article index directory
     * @param tag       "noLevel" keeps every term; otherwise only terms found
     *                  in the word list for that level (siji/liuji/kaoyan)
     * @return unsorted list of term/frequency pairs
     * @throws IOException on index read/write failure
     */
    public static ArrayList<WordModel> getTF(String text, String articleId, final String tag) throws IOException {
        return extractTermFrequencies(text, articleId, new TermFilter() {
            public boolean accept(String term) {
                // "noLevel" disables the word-list filter entirely
                return tag.equals("noLevel") || EnglishParser.checkEnglishWord(term, tag);
            }
        });
    }

    /**
     * Computes Chinese term frequencies for {@code text}. Only terms that
     * contain a Chinese character and are at least two characters long are kept.
     *
     * @param text      text to analyze
     * @param articleId id used to name the per-article index directory
     * @return unsorted list of term/frequency pairs
     * @throws IOException on index read/write failure
     */
    public static ArrayList<WordModel> getTF(String text, String articleId) throws IOException {
        return extractTermFrequencies(text, articleId, new TermFilter() {
            public boolean accept(String term) {
                // single-character terms are mostly noise for a word cloud
                return ChineseParser.isChinese(term) && term.length() >= 2;
            }
        });
    }
}
接下来对英语进行词频排序
package cn.iflin.project.participle.englishwords;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import cn.iflin.project.model.WordModel;
import cn.iflin.project.participle.WordsParticiple;
import cn.iflin.project.participle.wordcloud.CalculateChineseFrequency;
public class CalculateEnglishFrequency extends WordsParticiple {
    /**
     * Returns the English term-frequency list sorted by descending frequency.
     *
     * @param text      text to segment
     * @param articleId source of the text (database id or "temp" for user-supplied input)
     * @param tag       word-list level: siji / liuji / kaoyan, or "noLevel" for no filtering
     * @return word models sorted highest-frequency first; empty list on I/O failure
     */
    public static ArrayList<WordModel> getWordFre(String text, String articleId, String tag) {
        ArrayList<WordModel> wordList = new ArrayList<WordModel>();
        try {
            // strip common punctuation before indexing
            text = EnglishParser.delPunctuation(text);
            wordList = getTF(text, articleId, tag);
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Typed comparator replaces the original raw Comparator (which relied
        // on unchecked casts). Sorts by frequency, descending.
        Collections.sort(wordList, new Comparator<WordModel>() {
            @Override
            public int compare(WordModel a, WordModel b) {
                return b.getWordFrequency().compareTo(a.getWordFrequency());
            }
        });
        return wordList;
    }
}
最后是对中文词频进行排序
package cn.iflin.project.participle.wordcloud;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import cn.iflin.project.model.WordModel;
import cn.iflin.project.participle.WordsParticiple;
import cn.iflin.project.participle.englishwords.EnglishParser;
/**
* 计算词频
*
* @author Jaypan
*
*/
public class CalculateChineseFrequency extends WordsParticiple {
    /**
     * Returns the Chinese term-frequency list sorted by descending frequency.
     *
     * @param text      text to segment
     * @param articleId source of the text (database id or "temp" for user-supplied input)
     * @return word models sorted highest-frequency first; empty list on I/O failure
     */
    public static ArrayList<WordModel> getWordFre(String text, String articleId) {
        ArrayList<WordModel> wordList = new ArrayList<WordModel>();
        try {
            wordList = getTF(text, articleId);
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Typed comparator replaces the original raw Comparator (which relied
        // on unchecked casts). Sorts by frequency, descending.
        Collections.sort(wordList, new Comparator<WordModel>() {
            @Override
            public int compare(WordModel a, WordModel b) {
                return b.getWordFrequency().compareTo(a.getWordFrequency());
            }
        });
        return wordList;
    }
}
英语解析器
package cn.iflin.project.participle.englishwords;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class EnglishParser {

    // Characters stripped by delPunctuation. Same set the original character
    // class removed: ( ) . , " ? ! : ; ' and the literal '|'. Compiled once
    // instead of on every call.
    private static final Pattern PUNCTUATION = Pattern.compile("[().,\"?!:;'|]");
    // Runs of two or more spaces (left behind after stripping punctuation).
    private static final Pattern MULTI_SPACE = Pattern.compile(" {2,}");

    /**
     * Removes common punctuation and normalizes whitespace.
     * Bug fix: the original replaced runs of 2+ spaces with the EMPTY string,
     * which glued adjacent words together ("a,  b" -> "ab"); runs are now
     * collapsed to a single space.
     *
     * @param text raw English text
     * @return text without punctuation, with space runs collapsed to one space
     */
    public static String delPunctuation(String text) {
        String noPunct = PUNCTUATION.matcher(text).replaceAll("");
        return MULTI_SPACE.matcher(noPunct).replaceAll(" ");
    }

    /**
     * Returns true if {@code checkWord} appears in the word list for the given
     * level. NOTE(review): re-reads the word-list file on every call — consider
     * caching per level if this is hot.
     *
     * @param checkWord    word to look up
     * @param englishClass word-list level (file name without extension)
     */
    public static boolean checkEnglishWord(String checkWord, String englishClass) {
        ArrayList<String> words = getEnglishWords(englishClass);
        for (String word : words) {
            if (word.equals(checkWord)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Loads the word list for the given level, one word per line, GBK-encoded.
     *
     * @param sourceName word-list level (file name without extension)
     * @return the words read, or an empty list if the file is missing/unreadable
     */
    public static ArrayList<String> getEnglishWords(String sourceName) {
        ArrayList<String> words = new ArrayList<String>();
        String filePath = "C:\\Spider\\EnglishWords\\" + sourceName + ".txt";
        File file = new File(filePath);
        if (!file.isFile()) {
            System.out.println("找不到指定文件");
            return words;
        }
        BufferedReader br = null;
        try {
            // word lists are stored as GBK text
            br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "GBK"));
            String lineText;
            while ((lineText = br.readLine()) != null) {
                words.add(lineText);
            }
        } catch (Exception e) {
            System.out.println("读取文件出错");
            e.printStackTrace();
        } finally {
            // original only closed the inner reader, and not on exception
            if (br != null) {
                try {
                    br.close();
                } catch (Exception ignored) {
                    // best-effort close
                }
            }
        }
        return words;
    }
}
中文解析器
package cn.iflin.project.participle.wordcloud;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ChineseParser {

    // One or more consecutive CJK ideographs (basic block U+4E00..U+9FA5).
    // '+' instead of the original '*', which matched the empty string at every
    // position; compiled once instead of per call.
    private static final Pattern CHINESE_RUN = Pattern.compile("[\u4e00-\u9fa5]+");

    /** Returns true if the character is a CJK ideograph in U+4E00..U+9FA5. */
    private static boolean isChinese(char c) {
        return c >= 0x4E00 && c <= 0x9FA5;
    }

    /**
     * Returns true if the string contains at least one Chinese character.
     *
     * @param str string to test; null yields false
     */
    public static boolean isChinese(String str) {
        if (str == null) {
            return false;
        }
        for (char c : str.toCharArray()) {
            if (isChinese(c)) {
                return true; // one Chinese character is enough
            }
        }
        return false;
    }

    /**
     * Splits a sentence into its runs of consecutive Chinese characters.
     * Bug fix: the original wrote into a fixed String[20], overflowing with an
     * ArrayIndexOutOfBoundsException on long input and padding short input
     * with nulls; the result is now exactly sized.
     *
     * @param s input sentence
     * @return the Chinese runs, in order of appearance (possibly empty)
     */
    public static String[] changeList(String s) {
        ArrayList<String> runs = new ArrayList<String>();
        Matcher matcher = CHINESE_RUN.matcher(s);
        while (matcher.find()) {
            runs.add(matcher.group());
        }
        return runs.toArray(new String[runs.size()]);
    }
}