word2vec Study Notes

May 31, 2014, 14:49:21

1. English word2vec:

1.1. Get the word2vec code (download, git clone, or SVN checkout)


Check the execute permissions: ls -l *.sh

Add execute permission: chmod a+x *.sh

1.2. Run ./demo-word.sh to download the English sample data text8.gz

Decompress it: gunzip -c text8.gz > text8

1.3. Run ./demo-word.sh to perform the word2vec training:

Afterwards, run ./distance vectors.bin to compute the similarity between words:
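(For a query word, distance ranks every word in the vocabulary by cosine similarity to it, cos(a, b) = a · b / (|a| |b|), and prints the nearest ones.)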

Query france:

Query china:

1.4. Run ./demo-classes.sh to do K-Means clustering of the vectors:


1.5. Write a script to compute the similarity between any two words:
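The idea is simply to load the trained vectors and take the cosine of the two word vectors. Below is a minimal Java sketch of such a script; it assumes the vectors were written in word2vec's text format (train with -binary 0), and the class name WordSimilarity is just illustrative:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

// Illustrative sketch: load word vectors saved by word2vec in text format (-binary 0)
// and print the cosine similarity between two words given on the command line.
public class WordSimilarity {
    public static void main(String[] args) throws IOException {
        if (args.length != 3) {
            System.out.println("usage: java WordSimilarity <vectors.txt> <word1> <word2>");
            return;
        }
        Map<String, float[]> vectors = new HashMap<String, float[]>();
        BufferedReader reader = new BufferedReader(new FileReader(args[0]));
        reader.readLine();                                   // skip the header line: "<vocab_size> <dimension>"
        String line;
        while ((line = reader.readLine()) != null) {
            String[] parts = line.trim().split("\\s+");      // format: word v1 v2 ... vd
            if (parts.length < 2) continue;
            float[] vec = new float[parts.length - 1];
            for (int i = 1; i < parts.length; i++) {
                vec[i - 1] = Float.parseFloat(parts[i]);
            }
            vectors.put(parts[0], vec);
        }
        reader.close();
        float[] a = vectors.get(args[1]);
        float[] b = vectors.get(args[2]);
        if (a == null || b == null) {
            System.out.println("word not found in the vocabulary");
            return;
        }
        System.out.println(cosine(a, b));
    }
    // Cosine similarity: dot(a, b) / (|a| * |b|), the same measure the distance tool ranks by.
    public static double cosine(float[] a, float[] b) {
        double dot = 0, normA = 0, normB = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }
}

For example, java WordSimilarity vectors.txt france spain prints a single similarity score; the bundled distance tool does the same computation against the whole vocabulary and lists the nearest words.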


2. Chinese word2vec:

2.1. Data source: http://www.sogou.com/labs/dl/ca.html

News data from a number of news sites covering 18 channels (domestic, international, sports, society, entertainment, etc.) during June and July 2012; each item provides the URL and the article text.

Data format: each news item is a six-line <doc> block that contains, among other fields, a <contenttitle> line and a <content> line; those two lines are what we extract in the next step.

2.2. Extract the useful sentences, i.e. the titles and article bodies from the corpus above, and deduplicate them with a Bloom filter.

Bloom filter: see http://sobuhu.com/algorithm/2013/03/04/java-bloom-filter.html. A Bloom filter is a large bit array plus k hash functions: inserting a record sets k bits, and a membership query can give a false positive but never a false negative, which makes it a very memory-cheap way to deduplicate millions of lines.

The Bloom filter implementation:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;


public class BloomFilter {
    private BitSet bitSet;
    private int bitSetSize;
    private int addedElements;
    private int hashFunctionNumber;
    /**
     * Construct a Bloom filter whose capacity is c * k bits.
     * @param c the maximum number of records the filter is sized for, usually about twice the number you actually expect to insert.
     * @param n the number of records the filter is expected to hold (used to estimate the false positive rate).
     * @param k the number of hash functions, which is also the number of bits each record occupies.
     */
    public BloomFilter(int c, int n, int k) {
        this.hashFunctionNumber = k;
        this.bitSetSize = (int) Math.ceil(c * k);
        this.addedElements = n;
        this.bitSet = new BitSet(this.bitSetSize);
    }
    /**
     * Initialize the filter from a file, one record per line.
     * @param file path of the file to load
     */
    public void init(String file) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            String line = reader.readLine();
            while (line != null && line.length() > 0) {
                this.put(line);
                line = reader.readLine();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (reader != null) reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    public void put(String str) {
        int[] positions = createHashes(str.getBytes(), hashFunctionNumber);
        for (int i = 0; i < positions.length; i++) {
            int position = Math.abs(positions[i] % bitSetSize);
            bitSet.set(position, true);
        }
    }
    public boolean contains(String str) {
        byte[] bytes = str.getBytes();
        int[] positions = createHashes(bytes, hashFunctionNumber);
        for (int i : positions) {
            int position = Math.abs(i % bitSetSize);
            if (!bitSet.get(position)) {
                return false;
            }
        }
        return true;
    }
    /**
     * Get the theoretical false positive probability of the filter.
     * @return the probability that contains() wrongly reports a record as present
     */
    public double getFalsePositiveProbability() {
        // (1 - e^(-k * n / m)) ^ k
        return Math.pow((1 - Math.exp(-hashFunctionNumber * (double) addedElements / bitSetSize)),
                hashFunctionNumber);
    }
    /**
     * Hash the byte representation of a string with several hash functions.
     * @param bytes byte representation of the string to be added to the filter.
     * @param hashNumber number of hash functions to apply.
     * @return the array of hash values, one per hash function.
     */
    public static int[] createHashes(byte[] bytes, int hashNumber) {
        int[] result = new int[hashNumber];
        int k = 0;
        while (k < hashNumber) {
            result[k] = HashFunctions.hash(bytes, k);
            k++;
        }
        return result;
    }
    public static void main(String[] args) throws Exception {
        BloomFilter bloomfilter = new BloomFilter(30000000, 10000000, 8);
        System.out.println("Bloom Filter Initialize ... ");
        bloomfilter.init("data/base.txt");
        System.out.println("Bloom Filter Ready");
        System.out.println("False Positive Probability : "
                + bloomfilter.getFalsePositiveProbability());
        // look up new records against the filter
        List<String> result = new ArrayList<String>();
        long t1 = System.currentTimeMillis();
        BufferedReader reader = new BufferedReader(new FileReader("data/input.txt"));
        String line = reader.readLine();
        while (line != null && line.length() > 0) {
            if (!bloomfilter.contains(line)) {
                result.add(line);
            }
            line = reader.readLine();
        }
        reader.close();
        long t2 = System.currentTimeMillis();
        //System.out.println("Parse 9900000 items, Time : " + (t2 - t1) + "ms , find " + result.size() + " new items.");
       // System.out.println("Average : " + 9900000 / ((t2 - t1) / 1000) + " items/second");
    }
}
class HashFunctions {
    public static int hash(byte[] bytes, int k) {
        switch (k) {
            case 0:
                return RSHash(bytes);
            case 1:
                return JSHash(bytes);
            case 2:
                return ELFHash(bytes);
            case 3:
                return BKDRHash(bytes);
            case 4:
                return APHash(bytes);
            case 5:
                return DJBHash(bytes);
            case 6:
                return SDBMHash(bytes);
            case 7:
                return PJWHash(bytes);
        }
        return 0;
    }
    public static int RSHash(byte[] bytes) {
        int hash = 0;
        int magic = 63689;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = hash * magic + bytes[i];
            magic = magic * 378551;
        }
        return hash;
    }
    public static int JSHash(byte[] bytes) {
        int hash = 1315423911;
        for (int i = 0; i < bytes.length; i++) {
            hash ^= ((hash << 5) + bytes[i] + (hash >> 2));
        }
        return hash;
    }
    public static int ELFHash(byte[] bytes) {
        int hash = 0;
        int x = 0;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = (hash << 4) + bytes[i];
            if ((x = hash & 0xF0000000) != 0) {
                hash ^= (x >> 24);
                hash &= ~x;
            }
        }
        return hash;
    }
    public static int BKDRHash(byte[] bytes) {
        int seed = 131;
        int hash = 0;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = (hash * seed) + bytes[i];
        }
        return hash;
    }
    public static int APHash(byte[] bytes) {
        int hash = 0;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            if ((i & 1) == 0) {
                hash ^= ((hash << 7) ^ bytes[i] ^ (hash >> 3));
            } else {
                hash ^= (~((hash << 11) ^ bytes[i] ^ (hash >> 5)));
            }
        }
        return hash;
    }
    public static int DJBHash(byte[] bytes) {
        int hash = 5381;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = ((hash << 5) + hash) + bytes[i];
        }
        return hash;
    }
    public static int SDBMHash(byte[] bytes) {
        int hash = 0;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = bytes[i] + (hash << 6) + (hash << 16) - hash;
        }
        return hash;
    }
    public static int PJWHash(byte[] bytes) {
        long BitsInUnsignedInt = (4 * 8);
        long ThreeQuarters = ((BitsInUnsignedInt * 3) / 4);
        long OneEighth = (BitsInUnsignedInt / 8);
        long HighBits = (long) (0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
        int hash = 0;
        long test = 0;
        for (int i = 0; i < bytes.length; i++) {
            hash = (hash << OneEighth) + bytes[i];
            if ((test = hash & HighBits) != 0) {
                hash = (int) ((hash ^ (test >> ThreeQuarters)) & (~HighBits));
            }
        }
        return hash;
    }
}
Using the Bloom filter above, extract the sentences from the Sogou dataset:

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;

/*
 * June 2, 2014 14:37:58
 * Extract sentences from the SogouCA corpus and remove duplicates.
 * @niuliqiang
 */
public class ExtractSentences {
	public String path = "E:\\数据\\Sogou数据\\web\\";
	public String small = "news_tensite_xml.dat";
	public String smallSens = "sentences.txt";
	//
	public BloomFilter bloomfilter = new BloomFilter(15000000, 5000000, 8);	// capacity 15,000,000, expected records 5,000,000
	
	public void extractSentences(){
		List<String> sentences = new ArrayList<String>();
		try {
			//BufferedReader br = new BufferedReader(new FileReader(path+small));
			FileInputStream fis = new FileInputStream(path+small); 
	        InputStreamReader isr = new InputStreamReader(fis, "GBK"); 
	        BufferedReader br = new BufferedReader(isr); 
	        //
			String line = "";
			int lines = 0;
			int actualNum = 0;
			int existedNum = 0;
			while((line=br.readLine())!=null){
				if(line.startsWith("<contenttitle>") || line.startsWith("<content>")){
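					// strip the surrounding XML tags, keeping only the text between the first '>' and the last '<'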
					int index1 = line.indexOf(">");
					int index2 = line.lastIndexOf("<");
					line = line.substring(index1+1, index2);
					//
					if(bloomfilter.contains(line)==false){
						sentences.add(line);
						bloomfilter.put(line);
						actualNum++;
						//
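						// append to disk in batches of 100,000 sentences to keep memory use bounded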
						if(sentences.size() >=100000){
							//BufferedWriter bw = new BufferedWriter(new FileWriter(path+smallSens));
							FileOutputStream fos = new FileOutputStream(path+smallSens, true); 
					        OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8");
							for(int i=0; i<sentences.size(); i++){
								osw.write(sentences.get(i)+"\r\n");
							}
							osw.flush();
							osw.close();
							sentences.clear();
						}
					}
					else{
						existedNum++;
						//System.out.println(line);
					}
				}
				lines++;
				System.out.println("line : "+lines);
			}
			//
			if(sentences.size() >0){
				FileOutputStream fos = new FileOutputStream(path+smallSens, true); 
		        OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
				for(int i=0; i<sentences.size(); i++){
					osw.write(sentences.get(i)+"\r\n");
				}
				osw.flush();
				osw.close();
				sentences.clear();
			}
			//
			br.close();
			//
			System.out.println("(actualNum+existedNum)/totalNum"+" : "+actualNum+"+"+existedNum+"="+(actualNum+existedNum)+"/"+lines);
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	 public static void main(String[] args){
		 ExtractSentences es = new ExtractSentences();
		 long start = System.currentTimeMillis();
		 es.extractSentences();
		 long end  = System.currentTimeMillis();
		 System.out.println("time: "+(end-start)+" ms");
	 }
}
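As a quick sanity check on the filter parameters above (c = 15,000,000 and k = 8, so the bit array is m = c * k = 120,000,000 bits, with n = 5,000,000 expected records), the theoretical false positive rate is (1 - e^(-k*n/m))^k = (1 - e^(-1/3))^8 ≈ 4 × 10^-5, i.e. only a handful of non-duplicate lines per 100,000 would be wrongly dropped; since only about 1.6 million unique lines are actually inserted, the real rate is lower still.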
Experimental results:

Source data size: 1.43 GB

Number of lines: 7,765,398

Number of news items: 7765398 / 6 = 1,294,233

From these, 2,588,466 lines (titles or bodies) were extracted; after deduplication 1,636,090 lines remain (952,376 duplicate lines were removed, roughly 37% of what was extracted), so the Bloom filter clearly plays a key role.

Sentence extraction took about 100 seconds.

A sample is shown below:


2.3. Chinese word segmentation with the fudanNLP toolkit

fudanNLP:http://code.google.com/p/fudannlp/

The segmentation step itself is fairly simple.

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import edu.fudan.nlp.cn.tag.CWSTagger;
import edu.fudan.util.exception.LoadModelException;

/*
 * June 2, 2014 15:46:35
 * Segment all of the sentences into words.
 * @niuliqiang
 */
public class Segment {
	public String path = "E:\\数据\\Sogou数据\\web\\";
	public String sensPath = "sentences.txt";
	public String segsPath = "segment.txt";

	public void segment() {
		List<String> segments = new ArrayList<String>();
		try {
			FileInputStream fis = new FileInputStream(path + sensPath);
			InputStreamReader isr = new InputStreamReader(fis, "utf-8");
			BufferedReader br = new BufferedReader(isr);
			//
			String line = "";
			int lines = 0;
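			// load the fudanNLP Chinese word segmentation model (models/seg.m)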
			CWSTagger tag = new CWSTagger( "./models/seg.m" );
			while((line= br.readLine())!=null){
				//
				String segs = tag.tag(line);			// segmentation result, punctuation kept
				segments.add(segs);
				lines++;
				System.out.println("line : "+lines);
				if(segments.size() >10000){
					FileOutputStream fos = new FileOutputStream(path+segsPath, true); 
			        OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8");
					for(int i=0; i<segments.size(); i++){
						osw.write(segments.get(i)+"\r\n");
					}
					osw.flush();
					osw.close();
					segments.clear();
				}
			}
			if(segments.size() >0){
				FileOutputStream fos = new FileOutputStream(path+segsPath, true); 
		        OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8");
				for(int i=0; i<segments.size(); i++){
					osw.write(segments.get(i)+"\r\n");
				}
				osw.flush();
				osw.close();
				segments.clear();
			}
			//
			br.close();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (LoadModelException e) {
			e.printStackTrace();
		}
	}
	 public static void main(String[] args){
		 Segment seg = new Segment();
		 long start = System.currentTimeMillis();
		 seg.segment();
		 long end  = System.currentTimeMillis();
		 System.out.println("time: "+(end-start)/1000.0+" s");
	 }
}
Experimental results:

Number of sentence lines (each title or article body is one line): 1,636,090

Text file size: 1.46 GB

Time: 49.8 minutes

Text file size after segmentation: 1.73 GB

A sample is shown below (words separated by spaces):



2.4. Test word2vec on a small randomly chosen chunk of the segmented file:

Size of the segment1.txt chunk: 42.3 MB

Train it with word2vec via a shell script, mydemo-word.sh:

In the script, the input file is segment1.txt and the output file is seg1out.txt; once training finishes, distance is called.


Some results:


Calling distance:

Query 装备 (equipment):


Query 汽车 (car):


Query 乌克兰 (Ukraine):


Query 比赛 (match):


K-Means clustering:

 

The experiment on this small dataset already shows some promising signs; we expect even better results on the full 1.73 GB dataset.


2.5. word2vec experiment on the large dataset:

Total vocabulary size: 689,668

Training time: a little over one hour


Results from calling distance:





OK, a problem shows up: many words are heavily affected by the numbers, segmentation errors, and similar noise in the news dataset, so the final results are not good.

Compared with the earlier small dataset:




In summary, on the large dataset noise such as segmentation errors and stray numbers has a big impact, and part of the results are badly skewed. The small dataset has less noise, and its results are acceptable within its own scope. For words not affected by the noise, the large dataset actually gives better results, e.g. 子孙 (descendants).

2.6. Next steps

Next, I plan to improve the segmentation tooling and remove the noise, and then see how the results change.

Purely out of interest ~




