May 31, 2014, 14:49:21
1. English word2vec:
1.1. Obtain word2vec (download, git clone, or SVN checkout).
Check execute permissions: ls -l *.sh
Add execute permission: chmod a+x *.sh
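For reference, the whole setup (fetch, build, make the demo scripts executable) might have looked roughly like this at the time; the Google Code SVN URL is from memory and may no longer be reachable:
svn checkout http://word2vec.googlecode.com/svn/trunk/ word2vec
cd word2vec
make
chmod a+x *.sh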
1.2. Run ./demo-word.sh to download the English sample data text8.gz.
Decompress it: gunzip -c text8.gz > text8
1.3. Run ./demo-word.sh to perform the word2vec training:
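Inside demo-word.sh the training itself is a single call to the word2vec binary; a minimal sketch of that call (the exact hyperparameters differ between versions of the script, so treat them as assumptions):
time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1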
Then run ./distance vectors.bin to find the words most similar to a given input word:
Enter france:
Enter china:
1.4. Run ./demo-classes.sh to perform K-means clustering:
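The clustering demo reuses the same binary: passing -classes N makes word2vec run K-means on the learned vectors and write word-to-cluster assignments instead of the vectors themselves. A sketch, assuming 500 classes as in the bundled script:
./word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
sort classes.txt -k 2 -n > classes.sorted.txt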
1.5. Write a script to compute the similarity between any two words (a sketch follows):
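The bundled distance tool only prints the nearest neighbours of a single query word, so the pairwise similarity has to be computed separately. Below is a minimal Java sketch, not the original script: it assumes the vectors were saved in text format (word2vec run with -binary 0) to a hypothetical file vectors.txt, loads them into a map, and prints the cosine similarity of two words passed on the command line.
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.HashMap;
import java.util.Map;
/*
 * Minimal sketch (not the original script): load a text-format vector file
 * (produced by word2vec with -binary 0; the name vectors.txt is an assumption)
 * and print the cosine similarity of two words given as arguments.
 */
public class WordSimilarity {
    public static void main(String[] args) throws Exception {
        Map<String, float[]> vectors = new HashMap<String, float[]>();
        BufferedReader br = new BufferedReader(new FileReader("vectors.txt"));
        br.readLine(); // skip the header line: "<vocabSize> <dimension>"
        String line;
        while ((line = br.readLine()) != null) {
            String[] parts = line.trim().split("\\s+");
            float[] vec = new float[parts.length - 1];
            for (int i = 1; i < parts.length; i++) {
                vec[i - 1] = Float.parseFloat(parts[i]);
            }
            vectors.put(parts[0], vec);
        }
        br.close();
        String w1 = args[0], w2 = args[1];
        System.out.println(w1 + " ~ " + w2 + " : " + cosine(vectors.get(w1), vectors.get(w2)));
    }
    // cosine similarity: dot(a, b) / (|a| * |b|)
    public static double cosine(float[] a, float[] b) {
        double dot = 0, na = 0, nb = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            na += a[i] * a[i];
            nb += b[i] * b[i];
        }
        return dot / (Math.sqrt(na) * Math.sqrt(nb));
    }
}
Usage would be something like: java WordSimilarity france china (with vectors.txt in the working directory).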
2. Chinese word2vec:
2.1. Data source: http://www.sogou.com/labs/dl/ca.html
News data from a number of news sites, covering 18 channels (domestic, international, sports, society, entertainment, etc.) from June to July 2012; each record provides the URL and the body text.
Data format: each article is wrapped in <doc>...</doc> and contains <url>, <docno>, <contenttitle>, and <content> lines, i.e. six lines per article.
2.2. Extract the useful sentences, i.e. the titles and body text of the corpus above, and deduplicate them with a Bloom filter. A Bloom filter records each string by setting k hashed bit positions in a bit array; lookups may return false positives but never false negatives, which is acceptable for deduplication.
Bloom filter background: http://sobuhu.com/algorithm/2013/03/04/java-bloom-filter.html
The Bloom filter implementation:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
public class BloomFilter {
private BitSet bitSet;
private int bitSetSize;
private int addedElements;
private int hashFunctionNumber;
/**
 * Construct a Bloom filter; the filter allocates c * k bits.
 * @param c the maximum number of records the filter is sized for, usually about double the number expected to be inserted.
 * @param n the number of records the filter is expected to contain.
 * @param k the number of hash functions, i.e. the number of bits each record occupies.
 */
public BloomFilter(int c, int n, int k) {
this.hashFunctionNumber = k;
this.bitSetSize = (int) Math.ceil(c * k);
this.addedElements = n;
this.bitSet = new BitSet(this.bitSetSize);
}
/**
 * Initialize the filter from a file, one record per line.
 * @param file path of the file whose lines are inserted into the filter
 */
public void init(String file) {
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(file));
String line = reader.readLine();
while (line != null && line.length() > 0) {
this.put(line);
line = reader.readLine();
}
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
if (reader != null) reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
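/**
 * Add a record: set the bit at each of its k hashed positions.
 */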
public void put(String str) {
int[] positions = createHashes(str.getBytes(), hashFunctionNumber);
for (int i = 0; i < positions.length; i++) {
int position = Math.abs(positions[i] % bitSetSize);
bitSet.set(position, true);
}
}
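/**
 * Membership test: may return false positives, but never false negatives.
 */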
public boolean contains(String str) {
byte[] bytes = str.getBytes();
int[] positions = createHashes(bytes, hashFunctionNumber);
for (int i : positions) {
int position = Math.abs(i % bitSetSize);
if (!bitSet.get(position)) {
return false;
}
}
return true;
}
/**
 * Compute the expected false positive rate of the filter.
 * @return the false positive probability, (1 - e^(-k * n / m)) ^ k
 */
public double getFalsePositiveProbability() {
// (1 - e^(-k * n / m)) ^ k
return Math.pow((1 - Math.exp(-hashFunctionNumber * (double) addedElements / bitSetSize)),
hashFunctionNumber);
}
/**
 * Hash the byte representation of a string with several hash functions.
 * @param bytes byte representation of the string to be added to the filter.
 * @param hashNumber number of hash functions to apply.
 * @return the result of each hash function.
 */
public static int[] createHashes(byte[] bytes, int hashNumber) {
int[] result = new int[hashNumber];
int k = 0;
while (k < hashNumber) {
result[k] = HashFunctions.hash(bytes, k);
k++;
}
return result;
}
public static void main(String[] args) throws Exception {
BloomFilter bloomfilter = new BloomFilter(30000000, 10000000, 8);
System.out.println("Bloom Filter Initialize ... ");
bloomfilter.init("data/base.txt");
System.out.println("Bloom Filter Ready");
System.out.println("False Positive Probability : "
+ bloomfilter.getFalsePositiveProbability());
// look up which input lines are new (not seen during init)
List<String> result = new ArrayList<String>();
long t1 = System.currentTimeMillis();
BufferedReader reader = new BufferedReader(new FileReader("data/input.txt"));
String line = reader.readLine();
while (line != null && line.length() > 0) {
if (!bloomfilter.contains(line)) {
result.add(line);
}
line = reader.readLine();
}
reader.close();
long t2 = System.currentTimeMillis();
System.out.println("Time: " + (t2 - t1) + " ms, found " + result.size() + " new items.");
}
}
class HashFunctions {
public static int hash(byte[] bytes, int k) {
switch (k) {
case 0:
return RSHash(bytes);
case 1:
return JSHash(bytes);
case 2:
return ELFHash(bytes);
case 3:
return BKDRHash(bytes);
case 4:
return APHash(bytes);
case 5:
return DJBHash(bytes);
case 6:
return SDBMHash(bytes);
case 7:
return PJWHash(bytes);
}
return 0;
}
public static int RSHash(byte[] bytes) {
int hash = 0;
int magic = 63689;
int len = bytes.length;
for (int i = 0; i < len; i++) {
hash = hash * magic + bytes[i];
magic = magic * 378551;
}
return hash;
}
public static int JSHash(byte[] bytes) {
int hash = 1315423911;
for (int i = 0; i < bytes.length; i++) {
hash ^= ((hash << 5) + bytes[i] + (hash >> 2));
}
return hash;
}
public static int ELFHash(byte[] bytes) {
int hash = 0;
int x = 0;
int len = bytes.length;
for (int i = 0; i < len; i++) {
hash = (hash << 4) + bytes[i];
if ((x = hash & 0xF0000000) != 0) {
hash ^= (x >> 24);
hash &= ~x;
}
}
return hash;
}
public static int BKDRHash(byte[] bytes) {
int seed = 131;
int hash = 0;
int len = bytes.length;
for (int i = 0; i < len; i++) {
hash = (hash * seed) + bytes[i];
}
return hash;
}
public static int APHash(byte[] bytes) {
int hash = 0;
int len = bytes.length;
for (int i = 0; i < len; i++) {
if ((i & 1) == 0) {
hash ^= ((hash << 7) ^ bytes[i] ^ (hash >> 3));
} else {
hash ^= (~((hash << 11) ^ bytes[i] ^ (hash >> 5)));
}
}
return hash;
}
public static int DJBHash(byte[] bytes) {
int hash = 5381;
int len = bytes.length;
for (int i = 0; i < len; i++) {
hash = ((hash << 5) + hash) + bytes[i];
}
return hash;
}
public static int SDBMHash(byte[] bytes) {
int hash = 0;
int len = bytes.length;
for (int i = 0; i < len; i++) {
hash = bytes[i] + (hash << 6) + (hash << 16) - hash;
}
return hash;
}
public static int PJWHash(byte[] bytes) {
long BitsInUnsignedInt = (4 * 8);
long ThreeQuarters = ((BitsInUnsignedInt * 3) / 4);
long OneEighth = (BitsInUnsignedInt / 8);
long HighBits = (long) (0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
int hash = 0;
long test = 0;
for (int i = 0; i < bytes.length; i++) {
hash = (hash << OneEighth) + bytes[i];
if ((test = hash & HighBits) != 0) {
hash = (int) ((hash ^ (test >> ThreeQuarters)) & (~HighBits));
}
}
return hash;
}
}
Using the Bloom filter above to extract the sentences from the Sogou dataset:
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
/*
 * June 2, 2014, 14:37:58
 * Extract sentences from SogouCA and remove duplicates.
 * @niuliqiang
 */
public class ExtractSentences {
public String path = "E:\\数据\\Sogou数据\\web\\";
public String small = "news_tensite_xml.dat";
public String smallSens = "sentences.txt";
//
public BloomFilter bloomfilter = new BloomFilter(15000000, 5000000, 8); // 15 million capacity, 5 million expected records
public void extractSentences(){
List<String> sentences = new ArrayList<String>();
try {
//BufferedReader br = new BufferedReader(new FileReader(path+small));
FileInputStream fis = new FileInputStream(path+small);
InputStreamReader isr = new InputStreamReader(fis, "GBK");
BufferedReader br = new BufferedReader(isr);
//
String line = "";
int lines = 0;
int actualNum = 0;
int existedNum = 0;
while((line=br.readLine())!=null){
if(line.startsWith("<contenttitle>") || line.startsWith("<content>")){
int index1 = line.indexOf(">");
int index2 = line.lastIndexOf("<");
line = line.substring(index1+1, index2);
//
if(bloomfilter.contains(line)==false){
sentences.add(line);
bloomfilter.put(line);
actualNum++;
//
if(sentences.size() >=100000){
//BufferedWriter bw = new BufferedWriter(new FileWriter(path+smallSens));
FileOutputStream fos = new FileOutputStream(path+smallSens, true);
OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8");
for(int i=0; i<sentences.size(); i++){
osw.write(sentences.get(i)+"\r\n");
}
osw.flush();
osw.close();
sentences.clear();
}
}
else{
existedNum++;
//System.out.println(line);
}
}
lines++;
System.out.println("line : "+lines);
}
//
if(sentences.size() >0){
FileOutputStream fos = new FileOutputStream(path+smallSens, true);
OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
for(int i=0; i<sentences.size(); i++){
osw.write(sentences.get(i)+"\r\n");
}
osw.flush();
osw.close();
sentences.clear();
}
//
br.close();
//
System.out.println("(actualNum+existedNum)/totalNum"+" : "+actualNum+"+"+existedNum+"="+(actualNum+existedNum)+"/"+lines);
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args){
ExtractSentences es = new ExtractSentences();
long start = System.currentTimeMillis();
es.extractSentences();
long end = System.currentTimeMillis();
System.out.println("time: "+(end-start)+" ms");
}
}
Experimental results:
Source data size: 1.43 GB
Number of lines: 7,765,398
Number of news articles: 7,765,398 / 6 ≈ 1,294,233 (each article takes six lines in the format above)
2,588,466 lines (titles or content) were extracted; after deduplication 1,636,090 lines remain, i.e. 952,376 duplicate lines (about 36.8% of the extracted lines) were removed, so the Bloom filter clearly plays a key role.
Sentence extraction time: about 100 seconds.
Sample output:
2.3. Chinese word segmentation with the FudanNLP toolkit
FudanNLP: http://code.google.com/p/fudannlp/
The segmentation step is fairly simple:
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import edu.fudan.nlp.cn.tag.CWSTagger;
import edu.fudan.util.exception.LoadModelException;
/*
 * June 2, 2014, 15:46:35
 * Segment all sentences.
 * @niuliqiang
 */
public class Segment {
public String path = "E:\\数据\\Sogou数据\\web\\";
public String sensPath = "sentences.txt";
public String segsPath = "segment.txt";
public void segment() {
List<String> segments = new ArrayList<String>();
try {
FileInputStream fis = new FileInputStream(path + sensPath);
InputStreamReader isr = new InputStreamReader(fis, "utf-8");
BufferedReader br = new BufferedReader(isr);
//
String line = "";
int lines = 0;
CWSTagger tag = new CWSTagger( "./models/seg.m" );
while((line= br.readLine())!=null){
//
String segs = tag.tag(line); // segmentation result, punctuation included
segments.add(segs);
lines++;
System.out.println("line : "+lines);
if(segments.size() >10000){
FileOutputStream fos = new FileOutputStream(path+segsPath, true);
OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8");
for(int i=0; i<segments.size(); i++){
osw.write(segments.get(i)+"\r\n");
}
osw.flush();
osw.close();
segments.clear();
}
}
if(segments.size() >0){
FileOutputStream fos = new FileOutputStream(path+segsPath, true);
OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8");
for(int i=0; i<segments.size(); i++){
osw.write(segments.get(i)+"\r\n");
}
osw.flush();
osw.close();
segments.clear();
}
//
br.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (LoadModelException e) {
e.printStackTrace();
}
}
public static void main(String[] args){
Segment seg = new Segment();
long start = System.currentTimeMillis();
seg.segment();
long end = System.currentTimeMillis();
System.out.println("time: "+(end-start)/1000.0+" s");
}
}
Experimental results:
Number of sentence lines (one title or news body per line): 1,636,090
Text file size: 1.46 GB
Time: 49.8 minutes
Text file size after segmentation: 1.73 GB
Sample output (tokens separated by spaces):
2.4. Test word2vec on a small, already-segmented chunk of the data:
Size of the segment1.txt chunk: 42.3 MB
Train it with word2vec via a small shell script, mydemo-word.sh: the input file is segment1.txt, the output is seg1out.txt, and distance is called once training finishes (a sketch of the script follows).
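The script itself appeared only as a screenshot in the original notes, so the following is a reconstruction under assumptions; the hyperparameters are copied from demo-word.sh and may not match the ones actually used:
#!/bin/bash
# mydemo-word.sh (sketch): train on the segmented chunk, then explore with distance
time ./word2vec -train segment1.txt -output seg1out.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
./distance seg1out.txt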
Some results:
Calling distance:
Enter 装备 (equipment):
Enter 汽车 (car):
Enter 乌克兰 (Ukraine):
Enter 比赛 (match):
K-means clustering:
The experiment on this small dataset already shows some promise; we expect better results on the full 1.73 GB dataset.
2.5. word2vec experiment on the large dataset:
Vocabulary size: 689,668
Training time: a little over an hour
Results from distance:
A problem shows up here: many words are strongly affected by the numbers, segmentation errors, and other noise in the news data, so the final results are poor.
Comparison with the earlier small dataset:
In summary, on the large dataset the noise from segmentation errors, numbers, and so on has a large impact and skews part of the results badly; the small dataset has less noise, and its results are acceptable within its own scope. For words unaffected by the noise, the large-dataset results are better, e.g. 子孙 (descendants).
2.6. Next steps
Next, improve the segmentation tooling and remove the noise, then see how the results change.
Purely out of interest ~