使用 DL4J 训练中文词向量
1 预处理
对中文语料的预处理主要包括:分词、去停用词,以及根据实际场景制定的一些规则。
package ai.mole.test;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.library.Library;
import java.io.*;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;
public class Preprocess {
// Anchored pattern for text made up solely of digits and '.' characters (one or more).
// Note that it does not validate number syntax: inputs such as "1.2.3" or "..." also match.
// NOTE(review): presumably used to drop numeric tokens after segmentation; confirm against the usage site.
private static final Pattern NUMERIC_PATTERN = Pattern.compile("^[.\\d]+$");
// Anchored pattern for text made up solely of lowercase ASCII letters a-z (one or more).
// Uppercase letters are not covered by this character class.
// NOTE(review): presumably tokens are lowercased before this is applied; confirm against the usage site.
private static final Pattern ENGLISH_WORD_PATTERN = Pattern.compile("^[a-z]+$");
public static void main(String[] args) {
String inPath1 = "D:\\MyData\\XUGP3\\Desktop\\测试分词\\test1.txt";
String inPath2 = "D:\\MyData\\XUGP3\\Desktop\\测试分词\\stop_words.txt";
String outPath = "D:\\MyData\\XUGP3\\Desktop\\测试分词\\result1.txt";
String encoding = "utf-8";
PrintWriter writer = null;
Forest forest = null;
try {
writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outPath), encoding));
forest = Library.makeForest(Test.class.getResourceAsStream("/library/userLibrary.dic"));
List lineList = IOUtil.readLines(new FileInputStream(inPath1), encoding);
List stopWordList = IOUtil.readLines(new FileInputStream(inPath2), encoding);
for (String line