普林斯顿 算法第四版
本文的代码以及之前的作业代码可通过以下 GitHub 链接获得:
https://github.com/Changjing-Liu/algorithm_lab
引言
作业要求链接:
https://coursera.cs.princeton.edu/algs4/assignments/wordnet/specification.php
作业常见问题解答:
https://coursera.cs.princeton.edu/algs4/assignments/wordnet/faq.php
本次作业需要完成一个wordnet,即英语语义词典,需要完成(按照作业常见问题解答中老师推荐的顺序):
- SAP.java
- WordNet.java
- Outcast.java
一、SAP.java
这一部分可以偷懒,采用algs4里的BreadthFirstDirectedPaths()来实现广度优先搜索,以此判断两点的路径是否存在,并确定两点间的距离。
import edu.princeton.cs.algs4.BreadthFirstDirectedPaths;
import edu.princeton.cs.algs4.Digraph;
import edu.princeton.cs.algs4.In;
import edu.princeton.cs.algs4.StdIn;
import edu.princeton.cs.algs4.StdOut;
public class SAP {
private Digraph G;
private int anc = -1;
// constructor takes a digraph (not necessarily a DAG)
public SAP(Digraph G) {
if (G == null) {
throw new IllegalArgumentException();
} else {
this.G = new Digraph(G);
}
}
// length of shortest ancestral path between v and w; -1 if no such path
public int length(int v, int w) {
if (v < 0 || v > G.V() - 1 || w < 0 || w > G.V() - 1)
throw new IllegalArgumentException();
anc = -1;
BreadthFirstDirectedPaths bv = new BreadthFirstDirectedPaths(G, v);
BreadthFirstDirectedPaths bw = new BreadthFirstDirectedPaths(G, w);
int minlength = Integer.MAX_VALUE;
for (int i = 0; i < G.V(); i++) {
if (bv.hasPathTo(i) && bw.hasPathTo(i)) {
int l = bv.distTo(i) + bw.distTo(i);
if (l < minlength) {
minlength = l;
anc = i;
}
}
}
if (minlength == Integer.MAX_VALUE) return -1;
else return minlength;
}
// a common ancestor of v and w that participates in a shortest ancestral path; -1 if no such path
public int ancestor(int v, int w) {
length(v, w);
return anc;
}
// length of shortest ancestral path between any vertex in v and any vertex in w; -1 if no such path
public int length(Iterable<Integer> v, Iterable<Integer> w) {
int length = Integer.MAX_VALUE;
int temp;
for (Integer i : v) {
for (Integer j : w) {
temp = length(i, j);
if (temp < length)
length = temp;
}
}
if (length == Integer.MAX_VALUE) return -1;
else return length;
}
// a common ancestor that participates in shortest ancestral path; -1 if no such path
public int ancestor(Iterable<Integer> v, Iterable<Integer> w) {
int length = Integer.MAX_VALUE;
int local_anc = -1;
int temp;
for (Integer i : v) {
for (Integer j : w) {
temp = length(i, j);
if (temp < length) {
length = temp;
local_anc = this.anc;
}
}
}
if (length == Integer.MAX_VALUE) return -1;
else return local_anc;
}
public static void main(String[] args) {
In in = new In(".\\test\\digraph2.txt");
Digraph G = new Digraph(in);
SAP sap = new SAP(G);
while (!StdIn.isEmpty()) {
int v = StdIn.readInt();
int w = StdIn.readInt();
int length = sap.length(v, w);
int ancestor = sap.ancestor(v, w);
StdOut.printf("length = %d, ancestor = %d\n", length, ancestor);
}
}
}
二、WordNet.java
该部分首先需要对输入的数据格式进行分析处理,并采用合理的数据结构存储。此处基于ST<String, Bag<Integer>>实现word到其所属同义词集id集合的映射,基于ArrayList按id顺序存储各同义词集(synset)字符串,基于Digraph构建wordnet的有向图。
需要注意的一点是,构建有向图后,需要判断是否合法:即是否存在环,或是否存在多个根节点
有环可通过algs4中的DirectedCycle来判断,是否有多个根节点可通过记录v->w映射中的v,通过遍历所有节点,使用排除法判断未记录的节点数量是否超过1来判断。
import edu.princeton.cs.algs4.Bag;
import edu.princeton.cs.algs4.Digraph;
import edu.princeton.cs.algs4.DirectedCycle;
import edu.princeton.cs.algs4.In;
import edu.princeton.cs.algs4.ST;
import edu.princeton.cs.algs4.StdOut;
import java.util.ArrayList;
/**
 * An in-memory WordNet: nouns grouped into synsets, with hypernym ("is-a")
 * edges between synsets forming a rooted DAG.
 *
 * <p>Instances are immutable after construction. The shortest-ancestral-path
 * helper is built once in the constructor instead of on every query, because
 * {@code SAP}'s constructor copies the whole digraph.
 */
public class WordNet {
    // noun -> ids of every synset that contains the noun
    private final ST<String, Bag<Integer>> st;
    // synset id -> synset string (second field of the synsets file)
    private final ArrayList<String> idList;
    // shared SAP helper over the hypernym digraph
    private final SAP sap;

    /**
     * Builds the WordNet from the two input files.
     *
     * @param synsets   name of the synsets file; each line is {@code id,synset,gloss}
     * @param hypernyms name of the hypernyms file; each line is {@code id,hypernymId,...}
     * @throws IllegalArgumentException if an argument is null, a hypernyms line
     *         starts with an id that is not a synset id, or the hypernym graph
     *         is not a rooted DAG (it has a cycle or does not have exactly one root)
     */
    public WordNet(String synsets, String hypernyms) {
        if (synsets == null || hypernyms == null) {
            throw new IllegalArgumentException("file name is null");
        }
        st = new ST<String, Bag<Integer>>();
        idList = new ArrayList<String>();
        readSynsets(synsets);

        Digraph G = readHypernyms(hypernyms, idList.size());

        // A rooted DAG has no directed cycle ...
        DirectedCycle cycleFinder = new DirectedCycle(G);
        if (cycleFinder.hasCycle()) {
            throw new IllegalArgumentException("hypernym graph has a cycle");
        }
        // ... and exactly one vertex without outgoing edges (the root).
        // Counting outdegree-0 vertices also handles root lines that list
        // only their own id and no hypernyms.
        int roots = 0;
        for (int v = 0; v < G.V(); v++) {
            if (G.outdegree(v) == 0) {
                roots++;
            }
        }
        if (roots != 1) {
            throw new IllegalArgumentException(
                    "hypernym graph must have exactly one root but has " + roots);
        }
        sap = new SAP(G);
    }

    // Fills st and idList from the synsets file.
    // NOTE(review): assumes ids are consecutive from 0 in file order, so that
    // the line index equals the synset id - confirm against the input format.
    private void readSynsets(String synsets) {
        In in = new In(synsets);
        while (in.hasNextLine()) {
            // ',' separates id, synset and gloss
            String[] fields = in.readLine().split(",");
            Integer id = Integer.parseInt(fields[0]);
            idList.add(fields[1]);
            // ' ' separates the nouns inside one synset
            for (String noun : fields[1].split(" ")) {
                Bag<Integer> ids = st.get(noun);
                if (ids == null) {
                    ids = new Bag<Integer>();
                    st.put(noun, ids);
                }
                ids.add(id);
            }
        }
    }

    // Builds the hypernym digraph: one edge id -> hypernymId per listed hypernym.
    private Digraph readHypernyms(String hypernyms, int vertexCount) {
        Digraph G = new Digraph(vertexCount);
        In in = new In(hypernyms);
        while (in.hasNextLine()) {
            String[] fields = in.readLine().split(",");
            int id = Integer.parseInt(fields[0]);
            if (id < 0 || id >= vertexCount) {
                throw new IllegalArgumentException("unknown synset id " + id);
            }
            for (int i = 1; i < fields.length; i++) {
                G.addEdge(id, Integer.parseInt(fields[i]));
            }
        }
        return G;
    }

    /**
     * Returns all WordNet nouns.
     *
     * @return every noun that appears in at least one synset
     */
    public Iterable<String> nouns() {
        return st.keys();
    }

    /**
     * Is the word a WordNet noun?
     *
     * @param word the word to look up
     * @return {@code true} if the word appears in some synset
     * @throws IllegalArgumentException if {@code word} is null
     */
    public boolean isNoun(String word) {
        if (word == null) {
            throw new IllegalArgumentException("no word");
        }
        return st.contains(word);
    }

    /**
     * Returns the distance between two nouns: the length of a shortest
     * ancestral path between any synset of nounA and any synset of nounB.
     *
     * @param nounA first noun
     * @param nounB second noun
     * @return the shortest ancestral path length
     * @throws IllegalArgumentException if either noun is null or not a WordNet noun
     */
    public int distance(String nounA, String nounB) {
        requireNonNull(nounA, nounB);
        Bag<Integer> idsA = synsetIds(nounA, "nounA");
        Bag<Integer> idsB = synsetIds(nounB, "nounB");
        return sap.length(idsA, idsB);
    }

    /**
     * Returns a synset (second field of the synsets file) that is the common
     * ancestor of nounA and nounB in a shortest ancestral path.
     *
     * @param nounA first noun
     * @param nounB second noun
     * @return the synset string of the common ancestor
     * @throws IllegalArgumentException if either noun is null or not a WordNet noun
     */
    public String sap(String nounA, String nounB) {
        requireNonNull(nounA, nounB);
        Bag<Integer> idsA = synsetIds(nounA, "nounA");
        Bag<Integer> idsB = synsetIds(nounB, "nounB");
        return idList.get(sap.ancestor(idsA, idsB));
    }

    // Shared null check for the two-noun queries.
    private static void requireNonNull(String nounA, String nounB) {
        if (nounA == null || nounB == null) {
            throw new java.lang.IllegalArgumentException("the word is null");
        }
    }

    // Returns the synset ids of a noun, or throws if the noun is unknown.
    private Bag<Integer> synsetIds(String noun, String argumentName) {
        if (!isNoun(noun)) {
            throw new java.lang.IllegalArgumentException(
                    "the String " + argumentName + " is no in WordNet");
        }
        return st.get(noun);
    }

    /**
     * Unit test: builds a WordNet and prints the common ancestor of two nouns.
     *
     * @param args optional; {@code args[0]} and {@code args[1]} are the synsets
     *             and hypernyms files, defaulting to the files under
     *             {@code .\test} when fewer than two arguments are given
     */
    public static void main(String[] args) {
        String in1 = args.length >= 2 ? args[0] : ".\\test\\synsets.txt";
        String in2 = args.length >= 2 ? args[1] : ".\\test\\hypernyms.txt";
        WordNet net1 = new WordNet(in1, in2);
        StdOut.print(net1.sap("Adam", "Acre"));
    }
}
三、Outcast.java
按照要求,在word集合中找到与其他所有word的距离之和最大的那个word(即outcast),并返回该word。实现较为简单。
import edu.princeton.cs.algs4.In;
import edu.princeton.cs.algs4.StdOut;
/**
 * Finds the outcast of a group of WordNet nouns: the noun whose summed
 * distance to all the other nouns in the group is largest.
 */
public class Outcast {
    private final WordNet wordnet;

    /**
     * Creates an outcast finder backed by the given WordNet.
     *
     * @param wordnet the WordNet used for distance queries
     * @throws IllegalArgumentException if {@code wordnet} is null
     */
    public Outcast(WordNet wordnet) {
        if (wordnet == null) {
            throw new IllegalArgumentException("wordnet is null");
        }
        this.wordnet = wordnet;
    }

    /**
     * Given an array of WordNet nouns, returns an outcast.
     *
     * @param nouns the nouns to compare; must contain at least one element
     * @return the noun with the largest total distance to the others; on a tie
     *         the one that appears first in {@code nouns}
     * @throws IllegalArgumentException if {@code nouns} is null, empty, or
     *         contains null, or if a noun is rejected by {@code WordNet.distance}
     */
    public String outcast(String[] nouns) {
        if (nouns == null || nouns.length == 0) {
            throw new IllegalArgumentException("nouns is null or empty");
        }
        for (String noun : nouns) {
            if (noun == null) {
                throw new IllegalArgumentException("nouns contains null");
            }
        }

        int n = nouns.length;
        int[] totals = new int[n];
        // distance(a, b) == distance(b, a), so each unordered pair is queried
        // once and credited to both endpoints. Equal nouns contribute 0 either way.
        for (int i = 0; i < n; i++) {
            for (int j = i + 1; j < n; j++) {
                int d = wordnet.distance(nouns[i], nouns[j]);
                totals[i] += d;
                totals[j] += d;
            }
        }

        // Strict '>' keeps the first noun among equal maxima.
        int best = 0;
        for (int i = 1; i < n; i++) {
            if (totals[i] > totals[best]) {
                best = i;
            }
        }
        return nouns[best];
    }

    /**
     * Test client: prints the outcast of each outcast file.
     *
     * @param args optional; when at least three arguments are given they are
     *             used as {@code synsets hypernyms outcastFile...}; otherwise
     *             the bundled files under {@code .\test} are used
     */
    public static void main(String[] args) {
        String[] a = args;
        if (a.length < 3) {
            a = new String[] {
                ".\\test\\synsets.txt",
                ".\\test\\hypernyms.txt",
                ".\\test\\outcast5.txt",
                ".\\test\\outcast8.txt",
                ".\\test\\outcast11.txt"
            };
        }
        WordNet wordnet = new WordNet(a[0], a[1]);
        Outcast outcast = new Outcast(wordnet);
        for (int t = 2; t < a.length; t++) {
            In in = new In(a[t]);
            String[] nouns = in.readAllStrings();
            StdOut.println(a[t] + ": " + outcast.outcast(nouns));
        }
    }
}
四、总结
本次作业最终获得88/100,存在一些超时的问题。