一、堆排序
堆排序见之前的一篇博客 http://blog.csdn.net/zhutulang/article/details/7746033
二、top k 问题
有一千万条短信,其中有很多重复的。要求找出重复最多的10条短信。
在这里不考虑资源受限的情况,比较经典的解法是先用前10个元素构造一个最小堆,然后将剩余的元素挨个与堆顶元素比较:若比堆顶元素大,则取代堆顶元素(取代后执行下沉操作以保持堆序性质)。另外,也不考虑重复次数的问题。事实上这个问题看似简单,可是一旦加上各种限制条件延伸开,恐怕是可以很复杂很复杂的。
首先我们构造一个含有一千万条短信的文件。
package com.my.test5;
import java.io.*;
import java.util.UUID;
/**
* Title: 构造一个有一千万条短信的文本文件,每行一条短信<br/>
* Intention: <br/>
* <p>
* Class Name: com.my.test5.BuildTxtMsgFile<br/>
* Create Date: 2017/6/30 0:56 <br/>
* Project Name: MyTest <br/>
* Company: All Rights Reserved. <br/>
* Copyright © 2017 <br/>
* </p>
* <p>
* author: GaoWei <br/>
* 1st_examiner: <br/>
* 2nd_examiner: <br/>
* </p>
*
* @version 1.0
* @since JDK 1.7
*/
public class BuildTxtMsgFile {
    /**
     * Generates the sample message file, one message per line: every entry of the
     * fixed table below is written {@code count} times (9 million lines in total),
     * followed by 1 million random UUID lines.
     *
     * @param args unused
     * @throws Exception if the output file cannot be opened or written
     */
    public static void main(String[] args) throws Exception{
        // The 9 million duplicated messages; trailing comments give the frequency rank.
        MsgObj[] msgArr = {
                new MsgObj("明天还要早起呢!",390000),// 9th most frequent
                new MsgObj("帮我带早餐。",200000),
                new MsgObj("人工智能真的会统治世界吗?",200000),
                new MsgObj("hello java",700000), // 3rd most frequent
                new MsgObj("听说女神离职了?",3000000), // 1st most frequent
                new MsgObj("九月九日忆山东兄弟",430000),// 7th most frequent
                new MsgObj("山不在高有仙则灵",610000),// 4th most frequent
                new MsgObj("生于忧患死于安乐啊!!!",200000),
                new MsgObj("性格的培养至关重要",600000),// 5th most frequent
                new MsgObj("天龙八部",570000),// 6th most frequent
                new MsgObj("书剑恩仇录",200000),
                new MsgObj("做任何事情都是有技巧的",1000000),// 2nd most frequent
                new MsgObj("吾日三省吾身", 400000),// 8th most frequent
                new MsgObj("在人间",200000),
                new MsgObj("三千越甲可吞吴!",300000)// 10th most frequent
        };
        long start = System.currentTimeMillis();
        // try-with-resources closes (and therefore flushes) the writer even when a
        // write fails; the raw stream is its own resource so it is released even if
        // wrapping it in the writer throws.
        try (OutputStream out = new FileOutputStream(new File("E:\\tmp\\msg.txt"));
             BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"))) {
            // 1. Write the 9 million duplicated messages.
            for (MsgObj obj : msgArr) {
                // The line text and repeat count do not change inside the inner loop.
                String line = obj.getContent() + System.lineSeparator();
                int repeat = obj.getCount();
                for (int i = 0; i < repeat; i++) {
                    bw.write(line);
                }
            }
            // 2. Write 1 million messages with random content.
            for (int i = 0; i < 1000000; i++) {
                bw.write(UUID.randomUUID() + System.lineSeparator());
            }
        }
        long end = System.currentTimeMillis();
        System.out.println("写入完成,耗时:"+(end - start));
    }

    /**
     * Message model: the message content and the number of times it is to be
     * written to the file.
     */
    public static class MsgObj {
        private String content;
        private int count;

        /**
         * @param content message text
         * @param count   number of times the message is written
         */
        public MsgObj(String content, int count) {
            this.content = content;
            this.count = count;
        }

        /** @return the message text */
        public String getContent() {
            return content;
        }

        /** @param content the message text */
        public void setContent(String content) {
            this.content = content;
        }

        /** @return the number of times the message is written */
        public int getCount() {
            return count;
        }

        /** @param count the number of times the message is written */
        public void setCount(int count) {
            this.count = count;
        }
    }
}
堆节点元素定义:
package com.my.test5;
/**
* Title: 堆节点元素,存储短信内容和次数<br/>
* Intention: <br/>
* <p>
* Class Name: com.my.test5.MsgModel<br/>
* Create Date: 2017/7/3 0:34 <br/>
* Project Name: MyTest <br/>
* Company: All Rights Reserved. <br/>
* Copyright © 2017 <br/>
* </p>
* <p>
* author: GaoWei <br/>
* 1st_examiner: <br/>
* 2nd_examiner: <br/>
* </p>
*
* @version 1.0
* @since JDK 1.7
*/
public class MsgModel implements Comparable<MsgModel>{
    /**
     * Message content.
     */
    private String content;
    /**
     * Number of occurrences.
     */
    private int count;

    /** Creates an empty model (null content, count 0). */
    public MsgModel(){
    }

    /**
     * @param content message text
     * @param count   number of occurrences
     */
    public MsgModel(String content, int count) {
        this.content = content;
        this.count = count;
    }

    /** @return the message text */
    public String getContent() {
        return content;
    }

    /** @param content the message text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the number of occurrences */
    public int getCount() {
        return count;
    }

    /** @param count the number of occurrences */
    public void setCount(int count) {
        this.count = count;
    }

    /**
     * Orders models by ascending occurrence count; content is not considered.
     *
     * @param o the model to compare against
     * @return a negative value, zero, or a positive value when this count is
     *         less than, equal to, or greater than {@code o}'s count
     */
    @Override
    public int compareTo(MsgModel o) {
        // Integer.compare instead of subtraction: "count - o.getCount()" overflows
        // for operands of large magnitude and opposite sign, flipping the result.
        return Integer.compare(count, o.getCount());
    }

    @Override
    public String toString() {
        return "[content="+content+", count="+count+"]";
    }
}
find top K:
package com.my.test5;
import com.my.test6.MinHeap;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
/**
* Title: <br/>
* Intention: <br/>
* <p>
* Class Name: com.my.test5.FindTxtMsg<br/>
* Create Date: 2017/6/30 21:19 <br/>
* Project Name: MyTest <br/>
* Company: All Rights Reserved. <br/>
* Copyright © 2017 <br/>
* </p>
* <p>
* author: GaoWei <br/>
* 1st_examiner: <br/>
* 2nd_examiner: <br/>
* </p>
*
* @version 1.0
* @since JDK 1.7
*/
public class FindTxtMsg {
    /**
     * Prints the 10 most frequent messages of the sample file.
     *
     * @param args unused
     * @throws Exception if the message file cannot be read
     */
    public static void main(String[] args) throws Exception{
        findTop(10);
    }

    /**
     * Counts every distinct line of {@code E:\tmp\msg.txt} and prints the
     * {@code k} most frequent ones (in heap order, not sorted) together with
     * timing information.
     *
     * @param k number of messages to report; must be positive
     * @throws IllegalArgumentException if {@code k} is not positive
     * @throws Exception if the message file cannot be read
     */
    public static void findTop(int k) throws Exception {
        if (k <= 0) {
            // Without this guard the heap is never built and top() fails on null.
            throw new IllegalArgumentException("k must be positive: " + k);
        }
        long start = System.currentTimeMillis();
        // Count occurrences of each message with a map.
        Map<String,Integer> map = countMessages("E:\\tmp\\msg.txt");
        long end = System.currentTimeMillis();
        System.out.println("map统计完成,耗时="+(end - start));
        System.out.println("map统计完成,map大小="+map.size());
        // Turn the first k map entries into a MsgModel list, build a min-heap from
        // it, then use the heap to find the k messages with the largest counts.
        start = System.currentTimeMillis();
        // Never reserve more slots than there are distinct messages.
        ArrayList<MsgModel> list = new ArrayList<MsgModel>(Math.min(k, map.size()));
        MinHeap<MsgModel> myMinHeap = null;
        MsgModel myMinHeapTop;
        int i = 0;
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            if (++i <= k){
                list.add(new MsgModel(entry.getKey(), entry.getValue()));
                if (i == k) {
                    // Build the min-heap holding exactly k elements.
                    myMinHeap = new MinHeap<MsgModel>(list);
                }
            } else {
                // Remaining entries: replace the heap top when the entry is larger.
                myMinHeapTop = (MsgModel) myMinHeap.top();
                if (entry.getValue() > myMinHeapTop.getCount()) {
                    myMinHeap.replaceTop(new MsgModel(entry.getKey(), entry.getValue()));
                }
            }
        }
        if (myMinHeap == null && !list.isEmpty()) {
            // Fewer than k distinct messages: every one of them belongs to the result.
            myMinHeap = new MinHeap<MsgModel>(list);
        }
        end = System.currentTimeMillis();
        // Print the result (prints "null" only when the file held no lines at all).
        System.out.println(myMinHeap);
        System.out.println("find top k 完成,耗时="+(end - start));
    }

    /**
     * Reads the file line by line and counts how often each distinct line occurs.
     *
     * @param path file to read, decoded as UTF-8 (the encoding the generator writes)
     * @return map from line content to its number of occurrences
     * @throws Exception if the file cannot be opened or read
     */
    private static Map<String,Integer> countMessages(String path) throws Exception {
        Map<String,Integer> counts = new HashMap<String, Integer>();
        // try-with-resources releases the file handle on every exit path; the raw
        // stream is its own resource so it is closed even if creating the reader fails.
        try (FileInputStream in = new FileInputStream(path);
             BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
            String msg;
            while ((msg = br.readLine()) != null) {
                Integer seen = counts.get(msg);
                counts.put(msg, seen == null ? 1 : seen + 1);
            }
        }
        return counts;
    }
}
运行结果:
map统计完成,耗时=2635
map统计完成,map大小=1000015
[[content=三千越甲可吞吴!, count=300000], [content=九月九日忆山东兄弟, count=430000], [content=明天还要早起呢!, count=390000], [content=性格的培养至关重要, count=600000], [content=天龙八部, count=570000], [content=吾日三省吾身, count=400000], [content=hello java, count=700000], [content=山不在高有仙则灵, count=610000], [content=听说女神离职了?, count=3000000], [content=做任何事情都是有技巧的, count=1000000]]
find top k 完成,耗时=57
当然,如果你愿意的话,可以给最后的结果排下序。
三、优先队列
可以查看jdk的PriorityQueue源码,实际上就是用最小堆实现的