import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
/**
 * Dictionary-based Chinese word segmenter.
 *
 * <p>{@link #init()} loads {@code data/dict.txt} into a character prefix tree
 * (spelled "Tire" throughout this code base). Each dictionary word is stored
 * with its frequency and the cost {@code ln(totalFreq / freq)}.
 * {@link #segment(String)} then chooses the split of a sentence whose summed
 * word cost is minimal.
 */
public class ChnSeq {
    /**
     * Cost charged for a single character that is not itself a dictionary word.
     * It is far larger than any logarithmic word cost, so such a character is
     * only emitted on its own when no dictionary word can cover it.
     */
    private static final double UNKNOWN_CHAR_COST = 1.0e6;

    /** Root of the prefix tree; {@code null} until {@link #init()} succeeds. */
    private TireNode tire = null;

    /**
     * Loads the dictionary from {@code data/dict.txt} (UTF-8).
     *
     * <p>The first line holds the total frequency. Every following non-blank
     * line holds a word and its frequency separated by whitespace; any further
     * fields on the line are ignored. The tree is published only after the
     * whole file has been read, so a failed load never leaves a half-built
     * dictionary behind.
     *
     * @throws IOException if the file is missing, unreadable, empty or malformed
     * @throws ClassNotFoundException never thrown by this implementation;
     *         declared only to keep the signature callers already compile against
     */
    public void init() throws IOException, ClassNotFoundException {
        File file = new File("data" + File.separator + "dict.txt");
        if (!file.isFile()) {
            // Report the problem to the caller instead of exiting the JVM with a success status.
            throw new IOException("语料库不存在: " + file.getPath());
        }
        BufferedReader in = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "utf-8"));
        try {
            String header = in.readLine();
            if (header == null) {
                throw new IOException("Dictionary file is empty: " + file.getPath());
            }
            if (header.length() > 0 && header.charAt(0) == '\uFEFF') {
                // Drop a UTF-8 byte-order mark so the count still parses.
                header = header.substring(1);
            }
            int totalFreq = parseCount(header, 1);

            TireNode root = new TireNode();
            int lineNo = 1;
            String line;
            while ((line = in.readLine()) != null) {
                lineNo++;
                String entry = line.trim();
                if (entry.length() == 0) {
                    continue; // tolerate blank lines, e.g. a trailing newline
                }
                String[] fields = entry.split("\\s+");
                if (fields.length < 2) {
                    throw new IOException(
                            "Malformed dictionary entry at line " + lineNo + ": " + line);
                }
                String word = fields[0];
                int freq = parseCount(fields[1], lineNo);

                // Walk / extend the tree one character at a time.
                TireNode node = root;
                for (int i = 0; i < word.length(); i++) {
                    String c = String.valueOf(word.charAt(i));
                    TireNode child = node.getChild(c);
                    if (child == null) {
                        child = new TireNode();
                        child.setCharacter(c);
                        node.addChild(child);
                    }
                    node = child;
                }
                node.setFrequency(freq);
                node.setAntilog(Math.log((double) totalFreq / freq));
            }
            tire = root;
        } finally {
            in.close();
        }
    }

    /** Parses an integer field, reporting the offending line on failure. */
    private static int parseCount(String text, int lineNo) throws IOException {
        try {
            return Integer.parseInt(text.trim());
        } catch (NumberFormatException e) {
            throw new IOException(
                    "Invalid number at dictionary line " + lineNo + ": " + text, e);
        }
    }

    /**
     * Returns the root of the prefix tree.
     *
     * @return the root node, or {@code null} if {@link #init()} has not completed
     */
    public TireNode getTire() {
        return tire;
    }

    /**
     * Looks up the tree node reached by following the characters of {@code word}.
     *
     * @param word the character sequence to follow; an empty string yields the root
     * @return the node at the end of the path, or {@code null} if the path does
     *         not exist or the dictionary has not been initialised (in which
     *         case a message is also written to {@code System.err})
     */
    public TireNode getNodeByWord(String word) {
        if (tire == null) {
            System.err.println("需要先初始化ChnSeq对象!");
            return null;
        }
        TireNode node = tire;
        for (int i = 0; i < word.length() && node != null; i++) {
            node = node.getChild(String.valueOf(word.charAt(i)));
        }
        return node;
    }

    /**
     * A candidate word: the half-open character range {@code [start, end)} of
     * the sentence together with the cost of choosing it.
     */
    private static final class Segment {
        final int start;
        final int end;
        final double cost;

        Segment(int start, int end, double cost) {
            this.start = start;
            this.end = end;
            this.cost = cost;
        }
    }

    /**
     * Builds the candidate list for {@code sentence}: every dictionary word
     * (frequency &gt; 0) occurring at every position, plus a penalised
     * single-character candidate wherever the dictionary offers none. The
     * fallback guarantees that the end of the sentence is always reachable.
     *
     * <p>Candidates are appended in non-decreasing {@code start} order, which
     * {@link #dynamicSegment(List, String)} relies on.
     */
    private List<Segment> preSegment(String sentence) {
        List<Segment> lattice = new ArrayList<Segment>();
        int length = sentence.length();
        for (int start = 0; start < length; start++) {
            // Width of one code point, so the fallback never splits a surrogate pair.
            int unit = Character.charCount(sentence.codePointAt(start));
            boolean unitCovered = false;

            // Extend the match one character at a time instead of re-walking each substring.
            TireNode node = tire;
            for (int end = start + 1; end <= length; end++) {
                node = node.getChild(String.valueOf(sentence.charAt(end - 1)));
                if (node == null) {
                    break; // no dictionary word continues with this prefix
                }
                if (node.getFrequency() <= 0) {
                    continue; // only a prefix of longer words, not a word itself
                }
                double cost = node.getAntilog();
                if (Double.isNaN(cost) || Double.isInfinite(cost)) {
                    continue; // unusable cost, e.g. from a non-positive total frequency
                }
                lattice.add(new Segment(start, end, cost));
                if (end - start == unit) {
                    unitCovered = true;
                }
            }
            if (!unitCovered) {
                lattice.add(new Segment(start, start + unit, UNKNOWN_CHAR_COST));
            }
        }
        return lattice;
    }

    /**
     * Picks the minimum-cost chain of candidates covering the whole sentence.
     *
     * <p>Candidates are connected by position (one may follow another only if
     * it starts where the other ends), so repeated characters in the sentence
     * cannot link unrelated positions. Because {@code segs} is ordered by
     * start offset, one forward pass relaxes every candidate after all of its
     * predecessors.
     *
     * @param segs candidates from {@link #preSegment(String)}
     * @param sentence the sentence the candidates refer to
     * @return the chosen words in sentence order
     */
    private String[] dynamicSegment(List<Segment> segs, String sentence) {
        int length = sentence.length();
        double[] best = new double[length + 1];   // best[k]: cheapest cost to cover [0, k)
        Segment[] via = new Segment[length + 1];  // via[k]: last candidate on that cheapest cover
        for (int k = 1; k <= length; k++) {
            best[k] = Double.POSITIVE_INFINITY;
        }
        for (Segment seg : segs) {
            if (seg.start > 0 && via[seg.start] == null) {
                continue; // start offset is not reachable (e.g. middle of a surrogate pair)
            }
            double candidate = best[seg.start] + seg.cost;
            if (candidate < best[seg.end]) {
                best[seg.end] = candidate;
                via[seg.end] = seg;
            }
        }

        // Follow the back-pointers once to count the words, then again to fill them in order.
        int words = 0;
        for (int pos = length; pos > 0; pos = via[pos].start) {
            words++;
        }
        String[] result = new String[words];
        for (int pos = length; pos > 0; pos = via[pos].start) {
            Segment seg = via[pos];
            result[--words] = sentence.substring(seg.start, seg.end);
        }
        return result;
    }

    /**
     * Splits {@code sentence} into words using the loaded dictionary.
     *
     * @param sentence the text to segment; {@code null} or empty yields an empty array
     * @return the words in order; concatenating them reproduces {@code sentence}
     * @throws IllegalStateException if {@link #init()} has not completed successfully
     */
    public String[] segment(String sentence) {
        if (tire == null) {
            throw new IllegalStateException("需要先初始化ChnSeq对象!");
        }
        if (sentence == null || sentence.length() == 0) {
            return new String[0];
        }
        return dynamicSegment(preSegment(sentence), sentence);
    }
}
import java.io.IOException;
/**
 * Command-line demo: initialises a {@code ChnSeq}, segments one fixed sample
 * sentence and prints the resulting words.
 */
public class Main {
    /**
     * Runs the demo. Each word is written to standard output followed by a tab
     * character; no trailing newline is printed.
     *
     * @param args ignored
     * @throws ClassNotFoundException propagated from {@code ChnSeq.init()}
     * @throws IOException propagated from {@code ChnSeq.init()}
     */
    public static void main(String[] args) throws ClassNotFoundException, IOException {
        final String sentence = "生活的决定权也一直都在自己手上";

        ChnSeq segmenter = new ChnSeq();
        segmenter.init();

        String[] words = segmenter.segment(sentence);
        for (int idx = 0; idx < words.length; idx++) {
            System.out.print(words[idx] + "\t");
        }
    }
}
// (web page residue) 333333333333333333333
// (web page residue) Most recently recommended article published on 2023-04-27 17:51:45