New Word Discovery, with Java and Spark Implementations

New word discovery is not a new problem, but the most interesting part is that an unsupervised algorithm can find semantically meaningful "words" with no human-curated knowledge at all, rather than randomly glued-together character fragments (this works thanks both to the effectiveness of the algorithm and to the fact that the corpus itself is built from meaningful words). This post follows matrix67's article 互联网时代的社会语言学:基于SNS的文本数据挖掘 ("Sociolinguistics in the Internet Age: Text Mining Based on SNS Data") and uses its unsupervised method: enumerate candidate words via N-grams, compute each candidate's frequency, internal cohesion, and boundary freedom, and keep the candidates that pass a set of thresholds. The original article explains all of this in depth and very accessibly, so see the link above for details. OK, talk is cheap; here is my implementation.
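For reference, the two quantities from matrix67's article can be written out as follows, where N is the total number of characters in the corpus, n(·) is a raw occurrence count, and H is the Shannon entropy of a candidate's left or right neighbor distribution:

\mathrm{cohesion}(w) = \min_{w=ab} \frac{p(w)}{p(a)\,p(b)} = \min_{w=ab} \frac{n(w)\,N}{n(a)\,n(b)}

\mathrm{freedom}(w) = \min\bigl(H_{\mathrm{left}}(w),\ H_{\mathrm{right}}(w)\bigr), \qquad H = -\sum_i p_i \log p_i

A candidate becomes a new word when its count, cohesion, and freedom all exceed their thresholds; this is exactly the acceptance check in the findWords() method below.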

Building on matrix67's blog post, I further tuned the algorithm for our specific business, including but not limited to the following (the core code will be published once the project goes live):


1. Use the existing lexicon, common high-frequency words, and common stopwords to remove irrelevant entries from the results.
2. Strip junk text and dirty data (ads and the like) from the corpus before processing.
3. Redefine within-word compactness as symmetric conditional probability (PMI), which handles new words longer than two characters better.
4. If the first character of a candidate is a measure word (个, 件, 次, ...) and its left neighbors are mostly numerals (1, 2, 两, ...), do not count it as a new word.
5. Compute the new-word score from both frequency and compactness (weighted towards compactness; boundary freedom is left out for now since the feature is not robust), and take the top N words by score.
6. Remove reduplicated interjections (呵呵, 嘻嘻嘻, 哈哈哈哈哈, ...) from the results.
7. Remove results that contain a high-frequency common character (了, 是, 不, 么, ...) and have high frequency but low compactness.
8. Remove new words containing two or more high-frequency common characters (了, 是, 不, 么, ...).
9. For a parent string and its substring (e.g. 西门庆 and 门庆): if their frequencies are close, drop the substring and keep the parent; if they are not close (e.g. 大野怪 and 野怪), drop the parent and keep the substring. A minimal sketch of this heuristic follows the list.
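
As an illustration of heuristic 9, here is a minimal sketch of the parent/substring deduplication, assuming a word-to-frequency map; the class name, method signature, and the closeness threshold are mine, not the project code:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class SubstringFilter {
    // If a substring's frequency is close to its parent's, the substring is
    // most likely just a fragment of the parent (门庆 vs 西门庆): drop the
    // substring. If the frequencies differ a lot, the substring is the more
    // general word (野怪 vs 大野怪): drop the parent.
    public static Map<String, Integer> dedup(Map<String, Integer> words, double closeRatio) {
        Set<String> toRemove = new HashSet<>();
        for (String parent : words.keySet()) {
            for (String child : words.keySet()) {
                if (child.length() >= parent.length() || !parent.contains(child)) continue;
                // a substring always occurs at least as often as its parent
                double ratio = 1.0 * words.get(parent) / words.get(child);
                toRemove.add(ratio >= closeRatio ? child : parent);
            }
        }
        Map<String, Integer> kept = new HashMap<>(words);
        kept.keySet().removeAll(toRemove);
        return kept;
    }
}

With closeRatio = 0.8, an input of {西门庆=93, 门庆=95, 大野怪=10, 野怪=200} keeps 西门庆 (93/95 is close) and 野怪 (10/200 is not).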

A standalone, single-machine Java implementation of the method from matrix67's article:

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class FindNewWord {
    public boolean if_remove_oldWors = true;// whether to filter known words already during the n-gram stage (unused in this standalone version)
    public String str_in="我现在正在玩lol,现在战况激烈可能现在没办法抽空回复你";
    int space_num = 4;// fixed character width of each neighbor-count field
    public int len_max = 4;// candidates of up to len_max+1 characters are enumerated
    double compactness_t = 1000;// cohesion threshold
    double flexible_t = 0.5;// boundary-freedom (entropy) threshold
    double wcount_t = 5;// minimum word frequency
    public HashMap<String,String> w_Map = new HashMap<String,String>();// word -> "count,leftNeighborCounts,rightNeighborCounts"
    public HashMap<String,String> nw_Map = new HashMap<String,String>();// accepted new words -> "count,cohesion,freedom"
    public Set<String> wset = new HashSet<String>();// all candidate substrings
    public int str_len = str_in.length();
    public int game_id;
    public String date_time;
    //
    public static Set<String> foundWords = new HashSet<String>();// already-known words to exclude
    public String cm = "99999999999999999";
    public int count_max = Integer.parseInt(cm.substring(0,space_num));// cap on a neighbor count, bounded by the field width
    //
    // first pass: enumerate candidate substrings and initialize their stats
    public void wordStatics(){
        str_in = str_in.replaceAll("[^\u4E00-\u9FA5^a-z^A-Z^0-9^\\>]", "");// keep CJK, letters, digits and the '>' sentence terminator
        str_len = str_in.length();
        if(str_len<2) return;

        // seed each entry with a dummy 的:0 field so later indexOf lookups are simpler
        String spacel = "                                  ";
        String info_ini = 0+",的:0"+spacel.substring(0,space_num-1)+",的:0"+spacel.substring(0,space_num-1);
        // n-gram enumeration (substrings of length 1 to len_max+1)
        for(int i=0;i<str_len;i++){
            if(str_in.charAt(i)=='>') continue;
            int flag = 0;// set once a sentence terminator is hit
            for(int j=0;j<=len_max;j++){
                if(i+j+1>str_len) continue;
                String cur_w = str_in.substring(i, i+j+1);// current candidate
                if(cur_w.charAt(cur_w.length()-1)=='>'){// candidate ends at a terminator, so it is not a word; skip
                    flag = 1;
                }
                if(flag == 1) continue;
                // initialize
                wset.add(cur_w);
                w_Map.put(cur_w, info_ini);
            }
        }

        // second pass: count occurrences and left/right neighbor characters
        for(int i=0;i<str_in.length();i++){
            if(str_in.charAt(i)=='>') continue;
            int flag = 0;// set once a sentence terminator is hit (or the word is already known)
            for(int j=0;j<=len_max;j++){
                if(i+j+1>str_len) continue;
                String cur_w = str_in.substring(i, i+j+1);// current candidate
                if(cur_w.charAt(cur_w.length()-1)=='>'){// candidate ends at a terminator
                    flag = 1;
                }
                if(flag == 1) continue;
                //
                String info_all = w_Map.get(cur_w);
                String info[] = info_all.split(",");
                int num = Integer.parseInt(info[0])+1;
                //
                String left_w = "";
                if(i!=0 && str_in.charAt(i-1)!='>')  left_w += str_in.charAt(i-1);
                String right_w = "";
                if((i+j+1)<str_len && str_in.charAt(i+j+1)!='>')  right_w += str_in.charAt(i+j+1);
                //
                info_all = updateWordStatic(info_all, left_w, "left");
                info_all = updateWordStatic(info_all, right_w, "right");
                info_all = info_all.substring((info_all.indexOf(',', 0)));
                w_Map.put(cur_w, num+info_all);
            }
        }
    }
    // update the left/right neighbor counts stored in a word's info string
    public String updateWordStatic(String str,String w_cur,String place){
        if(w_cur.length()<1) return str;
        String[] str_l = str.split(",");
        String spacel = "                                  ";
        if(place.equals("left")){
            int i_w = str_l[1].indexOf(w_cur+":");
            // entry found; must be >=0 rather than >0, since the seeded 的 entry sits at index 0
            if(i_w>=0){
                int i_n = i_w + w_cur.length()+1;
                int i_b = i_n+space_num;

                int num_cur = Integer.parseInt(str_l[1].substring(i_n, i_n+space_num).replaceAll(" ", ""))+1;
                String nums = "                           ";
                nums = nums.substring(0,space_num);
                if(num_cur<count_max){
                    String num_t = num_cur +  "";
                    nums = num_t + nums.substring(num_t.length());
                }
                else num_cur=count_max;
                str = str_l[0] + ","+str_l[1].substring(0, i_w) 
                        + w_cur+":"+nums
                        +str_l[1].substring(i_b)+","+str_l[2];
            }
            // entry not found: append it
            else{

                str = str_l[0] + ","+str_l[1]+w_cur+":"+"1"+spacel.substring(0,space_num-1)
                        +","+str_l[2];
            }
        }
        if(place.equals("right")){
            int i_w = str_l[2].indexOf(w_cur+":");
            //如果找到
            if(i_w>0){
                int i_n = i_w + w_cur.length()+1;
                int i_b = i_n+space_num;

                int num_cur = Integer.parseInt(str_l[2].substring(i_n, i_n+space_num).replaceAll(" ", ""))+1;
                String nums = "                           ";
                nums = nums.substring(0,space_num);
                if(num_cur<count_max){
                    String num_t = num_cur +  "";
                    nums = num_t + nums.substring(num_t.length());
                }
                else num_cur=count_max;
                str = str_l[0] + ","+str_l[1]+","+str_l[2].substring(0, i_w) 
                        + w_cur+":"+nums
                        +str_l[2].substring(i_b);
            }
            // entry not found: append it
            else{
                str = str_l[0] + ","+str_l[1]
                        +","+str_l[2]+w_cur+":"+"1"+spacel.substring(0,space_num-1);
            }
        }
        return str;
    }
    // compute internal cohesion and boundary freedom; collect new words
    public HashMap<String,String> findWords() throws Exception{
        System.out.println("existing word num: "+foundWords.size());
        //
        for(String s:wset){
            if(s.length()>1 && s.replaceAll("[^\u4E00-\u9FA5]", "").length()>0){// keep only candidates containing CJK characters
                // internal cohesion
                String info[] = w_Map.get(s).split(",");
                double compact_score = Double.MAX_VALUE;
                Integer s0 = Integer.parseInt(info[0]);// word frequency
                for(int i=0;i<s.length()-1;i++){
                    // try every two-way split of s
                    if(wset.contains(s.substring(0, i+1)) && wset.contains(s.substring(i+1))){
                        double s1 = 1.0*Integer.parseInt(w_Map.get(s.substring(0, i+1)).split(",")[0]);// left-part frequency
                        double s2 = 1.0*Integer.parseInt(w_Map.get(s.substring(i+1)).split(",")[0]);// right-part frequency
                        double score_comp = s0*str_len*1.0/(s1*s2);// denominator ~ probability of a chance collocation; the larger the ratio, the more word-like
                        //
                        compact_score = Math.min(compact_score, score_comp );
                    }
                }

                // boundary freedom: min of left and right neighbor entropy
                double left_f = culWordsEtropy(info[1]);
                double right_f =culWordsEtropy(info[2]);
                double flexible = Math.min(left_f, right_f );

                // candidates passing all thresholds become potential new words
                if(compact_score>compactness_t 
                        && flexible>flexible_t 
                        && s0>wcount_t 
                        && (!foundWords.contains(s)))
                    nw_Map.put(s, s0+","+compact_score+","+flexible);
            }
        }
        return nw_Map;
    }

    // entropy of the neighbor distribution encoded in a count field
    public double culWordsEtropy(String str){
        int sumc = 0;
        List<Integer> num_l =  new ArrayList<Integer>();
        for(int i=0;i<str.length();i++){
            if(str.charAt(i)==':'){
                Integer temp = Integer.parseInt(str.substring(i+1, i+1+space_num).replaceAll(" ", ""));// field width follows space_num instead of a hardcoded 5
                if(temp>0){
                    num_l.add(temp);
                    sumc += temp;
                }
            }
        }
        //
        double strEtropy = 0.0;
        for(Integer num:num_l){
            strEtropy += (-1.0) * (num*1.0/(sumc*1.0)) * (Math.log(num*1.0/(sumc*1.0)));
        }
        return strEtropy;
    }
    public static void saveStrCSV(String file, String content) {
        BufferedWriter out = null;
        try {
            out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, true)));
            out.write(content+"\r\n");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (out != null) out.close();// guard against NPE when the stream failed to open
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    public static void main(String args[]) throws Exception{ 
        String dataPath = "E:/learningData/NLP/some_text_doc/金瓶梅.txt";// or 西游记 (Journey to the West)
        //
        InputStreamReader read = new InputStreamReader(new FileInputStream(dataPath),"gbk"); //,"gbk","UTF-8"
        BufferedReader currBR = new BufferedReader(read);
        String tempCurrLine = "";
        StringBuilder con = new StringBuilder();// StringBuilder avoids quadratic string concatenation on a large file
        while ((tempCurrLine = currBR.readLine()) != null) {
            con.append(tempCurrLine);
        }
        currBR.close();
        //
        FindNewWord fnw = new FindNewWord();
        fnw.str_in = con.toString();
        fnw.wordStatics();
        HashMap<String,String> fw = fnw.findWords();
        System.out.println("\n\n\nnew words num: " + fw.size()); 
        //
        List<Map.Entry<String, String>> entryList = new ArrayList<Map.Entry<String, String>>(fw.entrySet());
        Collections.sort(entryList, new Comparator<Map.Entry<String, String>>() 
                {
                    @Override
                    public int compare(Map.Entry<String, String> o1, Map.Entry<String, String> o2) {
                        Integer num1 = Integer.parseInt(o1.getValue().split(",")[0]);
                        Integer num2 = Integer.parseInt(o2.getValue().split(",")[0]);
                        Double com1 = Double.valueOf(o1.getValue().split(",")[1]);
                        Double com2 = Double.parseDouble(o2.getValue().split(",")[1]);
                        //return com2.compareTo(com1); // alternative: sort by cohesion instead
                        return num2.compareTo(num1); // sort by frequency, descending
                    }
                });
        for (int i = 0; i < Math.min(300, entryList.size()); ++i) {// avoid IndexOutOfBounds when fewer than 300 words are found
            System.out.println(entryList.get(i).getKey()+"\t-->"+ entryList.get(i).getValue());
        }
    }
}

Now let the algorithm read Jin Ping Mei (金瓶梅) as a test. Part of the new-word output (columns: frequency, cohesion, freedom):

吩咐  -->405,1529.4558139534884,4.0082581300545606
伙计  -->241,1145.2131936416185,2.423341693467576
雪娥  -->239,1332.8995039219842,3.082098823200978
韩道国 -->211,1172.4191111862115,4.061459127893269
左右  -->206,1895.2380392815176,3.5324181923556903
晚夕  -->201,1046.5672754910577,3.708572332910948
提刑  -->195,1174.3713084805363,2.0821136913180194
师父  -->152,1142.9169610701424,2.8967660115396217
兄弟  -->149,1424.9685027919963,3.7470579082813456
秋菊  -->147,2261.07776504432,3.7488604958114653
良久  -->146,1553.7093203883496,3.873346472349265
御史  -->145,3226.143306607125,1.3085725892219808
须臾  -->144,2244.59385665529,4.067556537602757
伺候  -->141,2683.1859375,3.619320955768149
悄悄  -->122,1196.0950492687944,3.6785419885382886
乔大户 -->120,1407.0731707317073,2.7895116233201307
丫鬟  -->119,1340.5661870503598,3.6447949795948507
夏提刑 -->118,1490.53275713051,3.89193214636793
姥姥  -->116,1417.3836206896551,1.6372525278308285
告诉  -->114,1353.5642534753565,3.015068941026971
达达  -->108,1034.7288619544315,2.603724058730813
卷棚  -->106,2458.2177086639163,1.6620497616520689
慢慢  -->99,1084.6969429404414,3.4736352593048228
孙雪娥 -->93,1168.5473720410387,3.509899048916368
常峙节 -->84,1226.9888059701493,3.5717010220730745
崇祯  -->81,6985.437450826121,1.0546115534488711
傅伙计 -->81,1624.1142073170731,3.73473063195388
冯妈妈 -->81,1263.9021068615355,3.4071486331531933
琵琶  -->79,8324.886075949367,3.26117688842775

Who says Jin Ping Mei is a dirty book? It is clearly a respectable classic :). A txt of Journey to the West or Jin Ping Mei is about 1.5 MB, roughly 800k characters, and the program finishes in under a minute on a machine with 8 GB of RAM, which is acceptable.
But once the input grows much beyond that, the computation has to be distributed, since the naive n-gram approach is quite memory-hungry. The related business is still in development and testing, so only the core code fragments are shown below. The idea: read the data from HDFS or Hive, split the large text into small chunks, run the algorithm above inside each partition to find new words, then merge the per-partition results and collect them to the driver. One gripe about Spark's ubiquitous Dataset: Row supports many data structures, but for maps it only stores Scala's mutable.Map, not a mutable Java map. After much fiddling, I went back to JavaRDD (rather than Dataset) for this part.

    JavaRDD<Map<String, String>> statistic_info_pa = 
                content_no.javaRDD().mapPartitions( 
                new FlatMapFunction<Iterator<Row>, Map<String,String>>() {
                    private static final long serialVersionUID = 1L;
                    @Override
                    public Iterator<Map<String, String>> call(Iterator<Row> v_in) throws Exception {
                        //
                        LinkedList<Map<String, String>> linkedList = new LinkedList<Map<String, String>>(); 
                        String str_in="";
                        while(v_in.hasNext()){
                            Row v1 = v_in.next();
                            String content = v1.getString(1).replaceAll("[^\u4E00-\u9FA5^a-z^A-Z^0-9]", "");
                            str_in = content;
                            //
                            NewWordsDiscovery fnw = new NewWordsDiscovery();
                            NewWordsDiscovery.foundWords = colIndexNew.getValue();// known-word set shipped to executors (colIndexNew appears to be a broadcast variable)
                            fnw.str_in = str_in;
                            fnw.wordStatics();
                            Map<String,String> fw = fnw.findWords();
                            // keep only the top 1000 words per partition, ranked by cohesion
                            fw = NewWordsDiscovery.getTopNewWordsMapByCompactness(fw,1000);
                            linkedList.add(fw);
                        }
                        return linkedList.iterator();
                    }
                }
                );
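
getTopNewWordsMapByCompactness is referenced above but not shown in the fragment; here is a plausible sketch of it under the "count,cohesion,freedom" value encoding used throughout (the class name and exact signature are assumptions):

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class TopByCompactness {
    // Keep only the topN entries, ranked by the cohesion field
    // (index 1 of the "count,cohesion,freedom" value string).
    public static Map<String, String> getTopNewWordsMapByCompactness(
            Map<String, String> words, int topN) {
        List<Map.Entry<String, String>> entries = new ArrayList<>(words.entrySet());
        entries.sort((a, b) -> Double.compare(
                Double.parseDouble(b.getValue().split(",")[1]),
                Double.parseDouble(a.getValue().split(",")[1])));
        Map<String, String> top = new LinkedHashMap<>();
        for (Map.Entry<String, String> e
                : entries.subList(0, Math.min(topN, entries.size()))) {
            top.put(e.getKey(), e.getValue());
        }
        return top;
    }
}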

    // collect all per-partition results to the driver and merge them
    List<Map<String,String>> fw_list = statistic_info_pa.collect(); 
    Map<String, String> fw_res = new HashMap<>();
    for(Map<String,String> hm:fw_list){
        Iterator<Entry<String,String>> entryIt = hm.entrySet().iterator();  
        while(entryIt.hasNext()){  
            Entry<String, String> entry = entryIt.next();  
            String w = entry.getKey();  
            if(notKouQiCi(w)){// filters reduplicated interjections (heuristic 6); a sketch follows below
                // first occurrence: add it
                if(!fw_res.containsKey(w)){
                    String info = entry.getValue(); 
                    fw_res.put(w, info);
                }
                // otherwise merge: sum the frequency, cohesion, and freedom fields
                else{
                    String info[] = entry.getValue().split(","); 
                    String info_0[] = fw_res.get(w).split(",");
                    Integer num1 = Integer.parseInt(info_0[0]);
                    Integer num2 = Integer.parseInt(info[0]);
                    Double com1 = Double.valueOf(info_0[1]);
                    Double com2 = Double.parseDouble(info[1]);
                    Double flex1 = Double.valueOf(info_0[2]);
                    Double flex2 = Double.parseDouble(info[2]);
                    fw_res.put(w, (num1+num2)+","+(com1+com2)+","+(flex1+flex2));
                }
            }
        }
    }
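
notKouQiCi (heuristic 6, dropping reduplicated interjections) is likewise not shown; one simple version, assuming a small hard-coded interjection alphabet (the character set is illustrative, not from the project code):

public class InterjectionFilter {
    // Returns true if w is NOT a pure reduplication of a single
    // interjection character (呵呵, 嘻嘻嘻, 哈哈哈哈哈, ...).
    public static boolean notKouQiCi(String w) {
        String interjections = "呵哈嘻嘿哦啊呀嗯";
        if (w.length() > 1 && interjections.indexOf(w.charAt(0)) >= 0) {
            char first = w.charAt(0);
            return !w.chars().allMatch(c -> c == first);
        }
        return true;
    }
}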