word2vec Study Notes

May 31, 2014, 14:49:21

1. English word2vec:

1.1. Get the word2vec code (download, git clone, or SVN checkout)


Check the execute permissions: ls -l *.sh

Add execute permission: chmod a+x *.sh

1.2. Run ./demo-word.sh to download the English sample data text8.gz

Decompress it: gunzip -c text8.gz > text8

1.3. Run ./demo-word.sh to perform the word2vec training:

Afterwards, run ./distance vectors.bin to compute the similarity between words:
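(For a query word, distance ranks every word in the vocabulary by cosine similarity to it, cos(a, b) = a · b / (|a| |b|), and prints the nearest ones.)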

Query france:

Query china:

1.4. Run ./demo-classes.sh to do K-Means clustering of the vectors:


1.5. Write a script to compute the similarity between any two words:
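The idea is simply to load the trained vectors and take the cosine of the two word vectors. Below is a minimal Java sketch of such a script; it assumes the vectors were written in word2vec's text format (train with -binary 0), and the class name WordSimilarity is just illustrative:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

// Illustrative sketch: load word vectors saved by word2vec in text format (-binary 0)
// and print the cosine similarity between two words given on the command line.
public class WordSimilarity {
    public static void main(String[] args) throws IOException {
        if (args.length != 3) {
            System.out.println("usage: java WordSimilarity <vectors.txt> <word1> <word2>");
            return;
        }
        Map<String, float[]> vectors = new HashMap<String, float[]>();
        BufferedReader reader = new BufferedReader(new FileReader(args[0]));
        reader.readLine();                                   // skip the header line: "<vocab_size> <dimension>"
        String line;
        while ((line = reader.readLine()) != null) {
            String[] parts = line.trim().split("\\s+");      // format: word v1 v2 ... vd
            if (parts.length < 2) continue;
            float[] vec = new float[parts.length - 1];
            for (int i = 1; i < parts.length; i++) {
                vec[i - 1] = Float.parseFloat(parts[i]);
            }
            vectors.put(parts[0], vec);
        }
        reader.close();
        float[] a = vectors.get(args[1]);
        float[] b = vectors.get(args[2]);
        if (a == null || b == null) {
            System.out.println("word not found in the vocabulary");
            return;
        }
        System.out.println(cosine(a, b));
    }
    // Cosine similarity: dot(a, b) / (|a| * |b|), the same measure the distance tool ranks by.
    public static double cosine(float[] a, float[] b) {
        double dot = 0, normA = 0, normB = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }
}

For example, java WordSimilarity vectors.txt france spain prints a single similarity score; the bundled distance tool does the same computation against the whole vocabulary and lists the nearest words.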


2. Chinese word2vec:

2.1. Data source: http://www.sogou.com/labs/dl/ca.html

News data from a number of news sites covering 18 channels (domestic, international, sports, society, entertainment, etc.) during June and July 2012; each item provides the URL and the article text.

Data format: each news item is a six-line <doc> block that contains, among other fields, a <contenttitle> line and a <content> line; those two lines are what we extract in the next step.

2.2. Extract the useful sentences, i.e. the titles and article bodies from the corpus above, and deduplicate them with a Bloom filter.

Bloom filter: see http://sobuhu.com/algorithm/2013/03/04/java-bloom-filter.html. A Bloom filter is a large bit array plus k hash functions: inserting a record sets k bits, and a membership query can give a false positive but never a false negative, which makes it a very memory-cheap way to deduplicate millions of lines.

The Bloom filter implementation:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;


public class BloomFilter {
    private BitSet bitSet;
    private int bitSetSize;
    private int addedElements;
    private int hashFunctionNumber;
    /**
     * Construct a Bloom filter whose capacity is c * k bits.
     * @param c the maximum number of records the filter is sized for, usually about twice the number you actually expect to insert.
     * @param n the number of records the filter is expected to hold (used to estimate the false positive rate).
     * @param k the number of hash functions, which is also the number of bits each record occupies.
     */
    public BloomFilter(int c, int n, int k) {
        this.hashFunctionNumber = k;
        this.bitSetSize = (int) Math.ceil(c * k);
        this.addedElements = n;
        this.bitSet = new BitSet(this.bitSetSize);
    }
    /**
     * Initialize the filter from a file, one record per line.
     * @param file path of the file to load
     */
    public void init(String file) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            String line = reader.readLine();
            while (line != null && line.length() > 0) {
                this.put(line);
                line = reader.readLine();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (reader != null) reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    public void put(String str) {
        int[] positions = createHashes(str.getBytes(), hashFunctionNumber);
        for (int i = 0; i < positions.length; i++) {
            int position = Math.abs(positions[i] % bitSetSize);
            bitSet.set(position, true);
        }
    }
    public boolean contains(String str) {
        byte[] bytes = str.getBytes();
        int[] positions = createHashes(bytes, hashFunctionNumber);
        for (int i : positions) {
            int position = Math.abs(i % bitSetSize);
            if (!bitSet.get(position)) {
                return false;
            }
        }
        return true;
    }
    /**
     * Get the theoretical false positive probability of the filter.
     * @return the probability that contains() wrongly reports a record as present
     */
    public double getFalsePositiveProbability() {
        // (1 - e^(-k * n / m)) ^ k
        return Math.pow((1 - Math.exp(-hashFunctionNumber * (double) addedElements / bitSetSize)),
                hashFunctionNumber);
    }
    /**
     * Hash the byte representation of a string with several hash functions.
     * @param bytes byte representation of the string to be added to the filter.
     * @param hashNumber number of hash functions to apply.
     * @return the array of hash values, one per hash function.
     */
    public static int[] createHashes(byte[] bytes, int hashNumber) {
        int[] result = new int[hashNumber];
        int k = 0;
        while (k < hashNumber) {
            result[k] = HashFunctions.hash(bytes, k);
            k++;
        }
        return result;
    }
    public static void main(String[] args) throws Exception {
        BloomFilter bloomfilter = new BloomFilter(30000000, 10000000, 8);
        System.out.println("Bloom Filter Initialize ... ");
        bloomfilter.init("data/base.txt");
        System.out.println("Bloom Filter Ready");
        System.out.println("False Positive Probability : "
                + bloomfilter.getFalsePositiveProbability());
        // look up new records against the filter
        List<String> result = new ArrayList<String>();
        long t1 = System.currentTimeMillis();
        BufferedReader reader = new BufferedReader(new FileReader("data/input.txt"));
        String line = reader.readLine();
        while (line != null && line.length() > 0) {
            if (!bloomfilter.contains(line)) {
                result.add(line);
            }
            line = reader.readLine();
        }
        reader.close();
        long t2 = System.currentTimeMillis();
        //System.out.println("Parse 9900000 items, Time : " + (t2 - t1) + "ms , find " + result.size() + " new items.");
       // System.out.println("Average : " + 9900000 / ((t2 - t1) / 1000) + " items/second");
    }
}
class HashFunctions {
    public static int hash(byte[] bytes, int k) {
        switch (k) {
            case 0:
                return RSHash(bytes);
            case 1:
                return JSHash(bytes);
            case 2:
                return ELFHash(bytes);
            case 3:
                return BKDRHash(bytes);
            case 4:
                return APHash(bytes);
            case 5:
                return DJBHash(bytes);
            case 6:
                return SDBMHash(bytes);
            case 7:
                return PJWHash(bytes);
        }
        return 0;
    }
    public static int RSHash(byte[] bytes) {
        int hash = 0;
        int magic = 63689;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = hash * magic + bytes[i];
            magic = magic * 378551;
        }
        return hash;
    }
    public static int JSHash(byte[] bytes) {
        int hash = 1315423911;
        for (int i = 0; i < bytes.length; i++) {
            hash ^= ((hash << 5) + bytes[i] + (hash >> 2));
        }
        return hash;
    }
    public static int ELFHash(byte[] bytes) {
        int hash = 0;
        int x = 0;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = (hash << 4) + bytes[i];
            if ((x = hash & 0xF0000000) != 0) {
                hash ^= (x >> 24);
                hash &= ~x;
            }
        }
        return hash;
    }
    public static int BKDRHash(byte[] bytes) {
        int seed = 131;
        int hash = 0;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = (hash * seed) + bytes[i];
        }
        return hash;
    }
    public static int APHash(byte[] bytes) {
        int hash = 0;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            if ((i & 1) == 0) {
                hash ^= ((hash << 7) ^ bytes[i] ^ (hash >> 3));
            } else {
                hash ^= (~((hash << 11) ^ bytes[i] ^ (hash >> 5)));
            }
        }
        return hash;
    }
    public static int DJBHash(byte[] bytes) {
        int hash = 5381;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = ((hash << 5) + hash) + bytes[i];
        }
        return hash;
    }
    public static int SDBMHash(byte[] bytes) {
        int hash = 0;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = bytes[i] + (hash << 6) + (hash << 16) - hash;
        }
        return hash;
    }
    public static int PJWHash(byte[] bytes) {
        long BitsInUnsignedInt = (4 * 8);
        long ThreeQuarters = ((BitsInUnsignedInt * 3) / 4);
        long OneEighth = (BitsInUnsignedInt / 8);
        long HighBits = (long) (0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
        int hash = 0;
        long test = 0;
        for (int i = 0; i < bytes.length; i++) {
            hash = (hash << OneEighth) + bytes[i];
            if ((test = hash & HighBits) != 0) {
                hash = (int) ((hash ^ (test >> ThreeQuarters)) & (~HighBits));
            }
        }
        return hash;
    }
}
Using the Bloom filter above, extract the sentences from the Sogou dataset:

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;

/*
 * June 2, 2014 14:37:58
 * Extract sentences from the SogouCA corpus and remove duplicates.
 * @niuliqiang
 */
public class ExtractSentences {
	public String path = "E:\\数据\\Sogou数据\\web\\";
	public String small = "news_tensite_xml.dat";
	public String smallSens = "sentences.txt";
	//
	public BloomFilter bloomfilter = new BloomFilter(15000000, 5000000, 8);	// capacity 15,000,000, expected records 5,000,000
	
	public void extractSentences(){
		List<String> sentences = new ArrayList<String>();
		try {
			//BufferedReader br = new BufferedReader(new FileReader(path+small));
			FileInputStream fis = new FileInputStream(path+small); 
	        InputStreamReader isr = new InputStreamReader(fis, "GBK"); 
	        BufferedReader br = new BufferedReader(isr); 
	        //
			String line = "";
			int lines = 0;
			int actualNum = 0;
			int existedNum = 0;
			while((line=br.readLine())!=null){
				if(line.startsWith("<contenttitle>") || line.startsWith("<content>")){
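					// strip the surrounding XML tags, keeping only the text between the first '>' and the last '<'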
					int index1 = line.indexOf(">");
					int index2 = line.lastIndexOf("<");
					line = line.substring(index1+1, index2);
					//
					if(bloomfilter.contains(line)==false){
						sentences.add(line);
						bloomfilter.put(line);
						actualNum++;
						//
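						// append to disk in batches of 100,000 sentences to keep memory use bounded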
						if(sentences.size() >=100000){
							//BufferedWriter bw = new BufferedWriter(new FileWriter(path+smallSens));
							FileOutputStream fos = new FileOutputStream(path+smallSens, true); 
					        OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8");
							for(int i=0; i<sentences.size(); i++){
								osw.write(sentences.get(i)+"\r\n");
							}
							osw.flush();
							osw.close();
							sentences.clear();
						}
					}
					else{
						existedNum++;
						//System.out.println(line);
					}
				}
				lines++;
				System.out.println("line : "+lines);
			}
			//
			if(sentences.size() >0){
				FileOutputStream fos = new FileOutputStream(path+smallSens, true); 
		        OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
				for(int i=0; i<sentences.size(); i++){
					osw.write(sentences.get(i)+"\r\n");
				}
				osw.flush();
				osw.close();
				sentences.clear();
			}
			//
			br.close();
			//
			System.out.println("(actualNum+existedNum)/totalNum"+" : "+actualNum+"+"+existedNum+"="+(actualNum+existedNum)+"/"+lines);
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	 public static void main(String[] args){
		 ExtractSentences es = new ExtractSentences();
		 long start = System.currentTimeMillis();
		 es.extractSentences();
		 long end  = System.currentTimeMillis();
		 System.out.println("time: "+(end-start)+" ms");
	 }
}
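As a quick sanity check on the filter parameters above (c = 15,000,000 and k = 8, so the bit array is m = c * k = 120,000,000 bits, with n = 5,000,000 expected records), the theoretical false positive rate is (1 - e^(-k*n/m))^k = (1 - e^(-1/3))^8 ≈ 4 × 10^-5, i.e. only a handful of non-duplicate lines per 100,000 would be wrongly dropped; since only about 1.6 million unique lines are actually inserted, the real rate is lower still.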
Experimental results:

Source data size: 1.43 GB

Number of lines: 7,765,398

Number of news items: 7765398 / 6 = 1,294,233

From these, 2,588,466 lines (titles or bodies) were extracted; after deduplication 1,636,090 lines remain (952,376 duplicate lines were removed, roughly 37% of what was extracted), so the Bloom filter clearly plays a key role.

Sentence extraction took about 100 seconds.

A sample is shown below:


2.3. Chinese word segmentation with the fudanNLP toolkit

fudanNLP:http://code.google.com/p/fudannlp/

The segmentation step itself is fairly simple.

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import edu.fudan.nlp.cn.tag.CWSTagger;
import edu.fudan.util.exception.LoadModelException;

/*
 * June 2, 2014 15:46:35
 * Segment all of the sentences into words.
 * @niuliqiang
 */
public class Segment {
	public String path = "E:\\数据\\Sogou数据\\web\\";
	public String sensPath = "sentences.txt";
	public String segsPath = "segment.txt";

	public void segment() {
		List<String> segments = new ArrayList<String>();
		try {
			FileInputStream fis = new FileInputStream(path + sensPath);
			InputStreamReader isr = new InputStreamReader(fis, "utf-8");
			BufferedReader br = new BufferedReader(isr);
			//
			String line = "";
			int lines = 0;
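			// load the fudanNLP Chinese word segmentation model (models/seg.m)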
			CWSTagger tag = new CWSTagger( "./models/seg.m" );
			while((line= br.readLine())!=null){
				//
				String segs = tag.tag(line);			// segmentation result, punctuation kept
				segments.add(segs);
				lines++;
				System.out.println("line : "+lines);
				if(segments.size() >10000){
					FileOutputStream fos = new FileOutputStream(path+segsPath, true); 
			        OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8");
					for(int i=0; i<segments.size(); i++){
						osw.write(segments.get(i)+"\r\n");
					}
					osw.flush();
					osw.close();
					segments.clear();
				}
			}
			if(segments.size() >0){
				FileOutputStream fos = new FileOutputStream(path+segsPath, true); 
		        OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8");
				for(int i=0; i<segments.size(); i++){
					osw.write(segments.get(i)+"\r\n");
				}
				osw.flush();
				osw.close();
				segments.clear();
			}
			//
			br.close();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (LoadModelException e) {
			e.printStackTrace();
		}
	}
	 public static void main(String[] args){
		 Segment seg = new Segment();
		 long start = System.currentTimeMillis();
		 seg.segment();
		 long end  = System.currentTimeMillis();
		 System.out.println("time: "+(end-start)/1000.0+" s");
	 }
}
Experimental results:

Number of sentence lines (each title or article body is one line): 1,636,090

Text file size: 1.46 GB

Time: 49.8 minutes

Text file size after segmentation: 1.73 GB

A sample is shown below (words separated by spaces):



2.4. Test word2vec on a small randomly chosen chunk of the segmented file:

Size of the segment1.txt chunk: 42.3 MB

Train it with word2vec via a shell script, mydemo-word.sh:

In the script, the input file is segment1.txt and the output file is seg1out.txt; once training finishes, distance is called.


Some results:


Calling distance:

Query 装备 (equipment):


Query 汽车 (car):


Query 乌克兰 (Ukraine):


Query 比赛 (match):


K-Means clustering:

 

The experiment on this small dataset already shows some promising signs; we expect even better results on the full 1.73 GB dataset.


2.5. word2vec experiment on the large dataset:

Total vocabulary size: 689,668

Training time: a little over one hour


Results from calling distance:





OK, a problem shows up: many words are heavily affected by the numbers, segmentation errors, and similar noise in the news dataset, so the final results are not good.

Compared with the earlier small dataset:




In summary, on the large dataset noise such as segmentation errors and stray numbers has a big impact, and part of the results are badly skewed. The small dataset has less noise, and its results are acceptable within its own scope. For words not affected by the noise, the large dataset actually gives better results, e.g. 子孙 (descendants).

2.6. Next steps

Next, I plan to improve the segmentation tooling and remove the noise, and then see how the results change.

Purely out of interest ~




