之前实现的二分搜索树不会存放重复元素,因此可以直接用它作为集合(Set)的底层数据结构。
在此之前,我们先新建一个FileOperation类,用来读取文本文件并把其中的单词提取出来。
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Scanner;

/**
 * File helper for the set demos: reads a text file and splits it into lower-case words.
 */
public class FileOperation {

    /**
     * Reads the file named {@code filename} and appends every word it contains to {@code words}.
     *
     * <p>A word is a maximal run of characters for which {@link Character#isLetter(char)} is
     * true. Each word is lower-cased with {@link Locale#ROOT}, so the result does not depend on
     * the JVM's default locale. This tokenizer is deliberately simple and ignores many real
     * text-processing concerns; it is meant for demo purposes only.
     *
     * @param filename path of the file to read
     * @param words    list that receives the words, in the order they appear in the file
     * @return {@code true} if the file exists and could be opened (an empty file adds no words);
     *         {@code false} if either argument is {@code null}, the file does not exist, or it
     *         cannot be opened
     */
    public static boolean readFile(String filename, ArrayList<String> words) {
        if (filename == null || words == null) {
            System.out.println("filename is null or words is null");
            return false;
        }

        File file = new File(filename);
        if (!file.exists()) {
            return false;
        }

        // try-with-resources: closing the Scanner also closes the wrapped streams,
        // so the file handle is released on every path.
        try (FileInputStream fis = new FileInputStream(file);
             Scanner scanner = new Scanner(new BufferedInputStream(fis), "UTF-8")) {
            scanner.useLocale(Locale.ENGLISH);
            if (scanner.hasNextLine()) {
                // "\\A" only matches at the start of input, so next() returns the whole file.
                String contents = scanner.useDelimiter("\\A").next();
                splitIntoWords(contents, words);
            }
            return true;
        } catch (IOException ioe) {
            System.out.println("Cannot open " + filename);
            return false;
        }
    }

    /** Appends each maximal run of letters in {@code contents}, lower-cased, to {@code words}. */
    private static void splitIntoWords(String contents, ArrayList<String> words) {
        int start = firstCharacterIndex(contents, 0);
        for (int i = start + 1; i <= contents.length(); ) {
            if (i == contents.length() || !Character.isLetter(contents.charAt(i))) {
                // [start, i) is a complete word; Locale.ROOT avoids e.g. Turkish "I" -> dotless i.
                words.add(contents.substring(start, i).toLowerCase(Locale.ROOT));
                start = firstCharacterIndex(contents, i);
                i = start + 1;
            } else {
                i++;
            }
        }
    }

    /**
     * Returns the index of the first letter in {@code s} at or after {@code start},
     * or {@code s.length()} if there is none.
     */
    private static int firstCharacterIndex(String s, int start) {
        for (int i = start; i < s.length(); i++) {
            if (Character.isLetter(s.charAt(i))) {
                return i;
            }
        }
        return s.length();
    }
}
先定义一个public的Set接口(注意这里是接口,而不是抽象类)
/**
 * Minimal set abstraction shared by the implementations in this article.
 *
 * @param <E> element type
 */
public interface Set<E> {

    /** Adds {@code e} to the set. */
    void add(E e);

    /** Removes {@code e} from the set. */
    void remove(E e);

    /** Returns whether the set contains {@code e}. */
    boolean contains(E e);

    /** Returns the number of elements in the set. */
    int getSize();

    /** Returns whether the set has no elements. */
    boolean isEmpty();
}
基于链表的set
import java.util.ArrayList;

/**
 * {@link Set} implementation that stores its elements in a {@code LinkedList}.
 *
 * @param <E> element type
 */
public class LinkedListSet<E> implements Set<E> {

    /** Backing list; every set operation delegates to it. */
    private final LinkedList<E> linkedList;

    /** Creates an empty set. */
    public LinkedListSet() {
        this.linkedList = new LinkedList<>();
    }

    /** Inserts {@code e} at the head of the list unless {@link #contains} already reports it. */
    @Override
    public void add(E e) {
        if (contains(e)) {
            return;
        }
        linkedList.addFirst(e);
    }

    /** Returns the result of the backing list's {@code contains(e)}. */
    @Override
    public boolean contains(E e) {
        return linkedList.contains(e);
    }

    /** Delegates to the backing list's {@code removeElement(e)}. */
    @Override
    public void remove(E e) {
        linkedList.removeElement(e);
    }

    /** Returns the backing list's size. */
    @Override
    public int getSize() {
        return linkedList.getSize();
    }

    /** Returns whether the backing list is empty. */
    @Override
    public boolean isEmpty() {
        return linkedList.isEmpty();
    }

    /**
     * Prints {@code title}, then, if {@code filename} can be read, the total number of words
     * and the number of distinct words counted with a fresh {@code LinkedListSet}.
     */
    private static void report(String title, String filename) {
        System.out.println(title);

        ArrayList<String> tokens = new ArrayList<>();
        if (!FileOperation.readFile(filename, tokens)) {
            return;
        }
        System.out.println("Total words: " + tokens.size());

        LinkedListSet<String> distinct = new LinkedListSet<>();
        for (int i = 0; i < tokens.size(); i++) {
            distinct.add(tokens.get(i));
        }
        System.out.println("Total different words: " + distinct.getSize());
    }

    /** Demo: word statistics for the two sample novels, separated by a blank line. */
    public static void main(String[] args) {
        report("Pride and Prejudice", "pride-and-prejudice.txt");
        System.out.println();
        report("A Tale of Two Cities", "a-tale-of-two-cities.txt");
    }
}
运行结果:
Pride and Prejudice
Total words: 125901
Total different words: 6531
A Tale of Two Cities
Total words: 141489
Total different words: 9945
至于英文文档,推荐一个外文搜书网站,链接是https://www.jiumodiary.com/。
基于二分搜索树的set(之前的二分搜索树add方法写的有偏差,已改)
import java.util.ArrayList; public class BSTSet<E extends Comparable<E>> implements Set<E>{ private BST<E> bst; public BSTSet(){ bst = new BST(); } @Override public void add(E e) { bst.add(e); } @Override public boolean contains(E e) { return bst.contians(e); } @Override public void remove(E e) { bst.remove(e); } @Override public int getSize() { return bst.getSize(); } @Override public boolean isEmpty() { return bst.isEmpty(); } public static void main(String[] args) { System.out.println("Pride and Prejudice"); ArrayList<String> words1 = new ArrayList<>(); if(FileOperation.readFile("pride-and-prejudice.txt", words1)){ System.out.println("Total words: " + words1.size()); BSTSet<String> set1 = new BSTSet<>(); for(String word: words1) set1.add(word); System.out.println("Total different words: " + set1.getSize()); } System.out.println(); System.out.println("A Tale of Two Cities"); ArrayList<String> words2 = new ArrayList<>(); if(FileOperation.readFile("a-tale-of-two-cities.txt", words2)){ System.out.println("Total words: " + words2.size()); BSTSet<String> set2 = new BSTSet<>(); for(String word: words2) set2.add(word); System.out.println("Total different words: " + set2.getSize()); } } }Pride and Prejudice
Total words: 125901
Total different words: 6530
A Tale of Two Cities
Total words: 141489
Total different words: 9944
基于avl树的set
import java.util.ArrayList; public class AVLSet<E extends Comparable<E>> implements Set<E>{ private AVLtree<E, Object> avLtree; public AVLSet(){ avLtree = new AVLtree<>(); } @Override public void add(E e) { avLtree.add(e, null); } @Override public boolean contains(E e) { return avLtree.contains(e); } @Override public void remove(E e) { avLtree.remove(e); } @Override public int getSize() { return avLtree.getSize(); } @Override public boolean isEmpty() { return avLtree.isEmpty(); } public static void main(String[] args) { System.out.println("Pride and Prejudice"); ArrayList<String> words1 = new ArrayList<>(); if(FileOperation.readFile("pride-and-prejudice.txt", words1)){ System.out.println("Total words: " + words1.size()); AVLSet<String> set1 = new AVLSet<>(); for(String word: words1) set1.add(word); System.out.println("Total different words: " + set1.getSize()); } System.out.println(); System.out.println("A Tale of Two Cities"); ArrayList<String> words2 = new ArrayList<>(); if(FileOperation.readFile("a-tale-of-two-cities.txt", words2)){ System.out.println("Total words: " + words2.size()); AVLSet<String> set2 = new AVLSet<>(); for(String word: words2) set2.add(word); System.out.println("Total different words: " + set2.getSize()); } }Pride and Prejudice
Total words: 125901
Total different words: 6530
A Tale of Two Cities
Total words: 141489
Total different words: 9944
接下来,我们对这三个基于不同底层实现的set集合进行测试。
import java.util.ArrayList;

/**
 * Timing comparison of the three {@link Set} implementations on the same text file.
 */
public class TestSetMain {

    /**
     * Reads {@code filename}, adds every word to {@code set}, and prints the word statistics.
     *
     * <p>The measured interval starts before the file is read and ends after the statistics
     * are printed, so it covers file reading and console output as well as the insertions.
     *
     * @param set      the set to fill
     * @param filename the text file to read
     * @return elapsed wall-clock time in seconds
     */
    private static double testSet(Set<String> set, String filename) {
        final long begin = System.nanoTime();

        System.out.println(filename);
        ArrayList<String> tokens = new ArrayList<>();
        boolean loaded = FileOperation.readFile(filename, tokens);
        if (loaded) {
            System.out.println("Total words: " + tokens.size());
            for (int i = 0; i < tokens.size(); i++) {
                set.add(tokens.get(i));
            }
            System.out.println("Total different words: " + set.getSize());
        }

        final long elapsedNanos = System.nanoTime() - begin;
        return elapsedNanos / 1000000000.0;
    }

    /** Runs {@link #testSet} on {@code set} and prints the elapsed time under {@code label}. */
    private static void runAndReport(String label, Set<String> set, String filename) {
        double seconds = testSet(set, filename);
        System.out.println(label + ": " + seconds + " s");
    }

    /** Benchmarks the BST, linked-list and AVL sets, in that order, on the same file. */
    public static void main(String[] args) {
        String filename = "pride-and-prejudice.txt";

        runAndReport("BST Set", new BSTSet<String>(), filename);
        System.out.println();

        runAndReport("Linked List Set", new LinkedListSet<String>(), filename);
        System.out.println();

        runAndReport("AVL Set", new AVLSet<String>(), filename);
    }
}
测试结果:
pride-and-prejudice.txt
Total words: 125901
Total different words: 6530
BST Set: 0.290137957 s
Total words: 125901
Total different words: 6531
Linked List Set: 4.79940087 s
Total words: 125901
Total different words: 6530
AVL Set: 0.10533812 s
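从这次测试可以看出AVL平衡二叉树在性能上的优势:链表集合每次add都要先遍历链表查重,单次操作是O(n);二分搜索树平均是O(log n),但可能退化;AVL树通过自平衡保证了O(log n)。需要注意的是,这里的计时还包含了读文件和打印的时间,而且只运行了一次,结果仅供参考。另外,链表集合统计出的不同单词数(6531)比两种树的结果(6530)多1,说明某个底层实现的去重逻辑可能存在问题,值得进一步排查。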