原始数据模拟
单个线程产生数据
- 数据格式,每个数字8BYTE,换行符\r\n 2BYTE
// Demo: single-threaded data generation — write Threshould random numbers
// into test.txt, one per line. Each record is 10 bytes on disk:
// 8 zero-padded digits plus "\r\n".
File file = new File("test.txt");
// Second argument `false` = overwrite mode, so reruns start from an empty file.
// NOTE(review): FileWriter uses the platform default charset — harmless for
// ASCII digits, but an explicit StandardCharsets.UTF_8 would be safer.
FileWriter fileWriter = new FileWriter(file, false);
int i=1;
Random random = new Random();
// Emit exactly Threshould records; values are uniform in [0, Threshould).
while (i<=Threshould){
int i1 = random.nextInt(Threshould) ;
// %08d pads to exactly 8 digits so every line has a fixed width,
// which is what lets the parallel readers split the file by offset.
String s = String.format("%08d", i1) + "\r\n";
fileWriter.write(s);
i++;
}
// Flush buffered output and release the file handle.
fileWriter.close();
System.out.println("size ="+file.length());
多个线程产生数据(每个线程负责文件的一部分)
/**
 * Generates testP.txt in parallel: the file is split into {@code Rounds}
 * equal regions (10 bytes per record: 8 zero-padded digits + "\r\n") and
 * each region is filled concurrently by a worker thread through a
 * memory-mapped buffer, so writers never contend for a shared position.
 *
 * @throws IOException          if the file cannot be created or mapped
 * @throws InterruptedException if interrupted while waiting for the pool
 */
public static void writeP() throws IOException, InterruptedException {
    File file = new File("testP.txt");
    if (file.isFile()) {
        file.delete();
    }
    // try-with-resources closes the channel and file on every path
    // (the original leaked both whenever an exception was thrown).
    try (RandomAccessFile randomAccessFile = new RandomAccessFile(file, "rw");
         FileChannel channel = randomAccessFile.getChannel()) {
        ExecutorService executorService =
                Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
        // 10L forces long arithmetic so Threshould*10 cannot overflow int.
        long size = (Threshould * 10L) / Rounds; // bytes per region
        long num = Threshould / Rounds;          // records per region
        // NOTE(review): if Threshould is not divisible by Rounds the
        // remainder records are silently dropped — confirm the inputs.
        long position = 0;
        for (int i = 0; i < Rounds; i++) {
            final long regionStart = position;
            executorService.submit(() -> {
                try {
                    Random random = new Random();
                    MappedByteBuffer map =
                            channel.map(FileChannel.MapMode.READ_WRITE, regionStart, size);
                    for (long j = 0; j < num; j++) {
                        String record = String.format("%08d", random.nextInt(Threshould)) + "\r\n";
                        map.put(record.getBytes());
                    }
                    // Flush this region's dirty pages so the size/content
                    // reported below reflect what was actually written.
                    map.force();
                } catch (IOException e) {
                    // Best-effort demo code: record the failure instead of
                    // silently losing the whole region.
                    e.printStackTrace();
                }
            });
            position += size;
        }
        executorService.shutdown();
        // Block with a sane timeout instead of spinning every 10 ms.
        while (!executorService.awaitTermination(1, TimeUnit.SECONDS)) {
            // still writing — keep waiting
        }
    }
    System.out.println("size =" + file.length());
}
筛选1亿随机数最大的100个数
- 使用原始数据材料
SPSC - 单生产者单消费者模式
/**
 * Single-threaded baseline (SPSC): streams the file at {@code path} line by
 * line and keeps the {@code MaxNum} largest values via the insert() helper,
 * printing the total count and the resulting top values.
 *
 * @param path   input file, one integer per line
 * @param MaxNum how many of the largest values to keep
 * @throws IOException if the file cannot be opened or read
 */
public void findMax0(String path, int MaxNum) throws IOException {
    int[] maxNum = new int[MaxNum];
    int count = 0;
    // try-with-resources closes the reader (the original leaked it).
    try (BufferedReader bufferedReader = new BufferedReader(new FileReader(new File(path)))) {
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            // parseInt avoids the needless Integer boxing of valueOf().
            insert(maxNum, Integer.parseInt(line));
            count++;
        }
    }
    System.out.println("Count of Num: " + count);
    System.out.println("maxNum:");
    for (int a : maxNum) {
        System.out.print(a + " - ");
    }
}
SPMC - 单生产者多消费者模式
/**
 * SPMC variant: the calling thread is the single producer (reads the file
 * line by line) and a fixed thread pool is the consumer side (each task
 * finds the MaxNum largest values of one chunk).
 *
 * Flow: the producer fills an int[RoundCount] buffer; each full buffer is
 * handed to a findMaxTask, which publishes its local top-MaxNum results into
 * the shared `last` array (indexed via the atomic `Position` cursor). A
 * final task then reduces `last` into MaxAll.
 *
 * @param path   input file, one integer per line
 * @param MaxNum how many of the largest values to keep
 */
public void findMaxBySPMC(String path,int MaxNum) throws IOException, InterruptedException {
// NOTE(review): the reader is never closed — try-with-resources would fix the leak.
BufferedReader bufferedReader = new BufferedReader(new FileReader(new File(path)));
String s="";int i=0;
// `last` collects the per-chunk winners; MaxAll receives the final result.
int[] last=new int[RoundCount];
int[] MaxAll=new int[MaxNum];
AtomicInteger Position = new AtomicInteger(0);
ExecutorService executorService = Executors.newFixedThreadPool(java.lang.Runtime.getRuntime().availableProcessors());
// Per-thread busy-time accounting, keyed by the pool's default thread names.
// NOTE(review): this HashMap is mutated concurrently by the worker threads
// in run() without synchronization — a ConcurrentHashMap would be safer.
HashMap<String, Long> hashMap = new HashMap<>();
// NOTE(review): the hard-coded 12 assumes availableProcessors() == 12; a
// thread named outside pool-1-thread-1..12 would NPE on the unboxing in
// run() — confirm on the target machine.
for(int K=1;K<=12;K++){
String threadName="pool-1-thread-"+K;
hashMap.put(threadName,0L);
}
// ExecutorService executorService = Executors.newFixedThreadPool(1);
// Consumer task: scan one chunk, keep its MaxNum largest values, publish
// them into `last`, and record elapsed wall time for this thread.
class findMaxTask implements Runnable{
private int[]arr;
private int[] maxNum;
public findMaxTask(int[] arr,int[]maxNum){
this.arr=arr;
this.maxNum=maxNum;
}
@Override
public void run() {
long start = System.currentTimeMillis();
for (int i = 0;i<arr.length;i++) {
insert(maxNum,arr[i]);
}
// Publish this chunk's winners; getAndIncrement keeps slots disjoint
// across concurrently-running tasks.
for(int i=0;i<MaxNum;i++){
last[Position.getAndIncrement()]=maxNum[i];
}
long end = System.currentTimeMillis();
Long aLong = hashMap.get(Thread.currentThread().getName());
hashMap.put(Thread.currentThread().getName(),aLong+end-start);
}
}
int[] ints=new int[RoundCount];int rounds=0;
// while ((s!=null)&&rounds<2){
while ((s!=null)){
s = bufferedReader.readLine();
// The final read may leave a partially-filled buffer.
if (s==null){
System.out.println("file end ###"+" []Length : "+i);
if (i!=0) {
// NOTE(review): the unfilled tail of `ints` holds zeros that are fed to
// insert() as real data — harmless for maxima over non-negative input,
// but worth confirming.
findMaxTask findMaxTask = new findMaxTask(ints,new int[MaxNum]);
rounds++;
executorService.submit(findMaxTask);
}
break;
}
Integer integer = Integer.valueOf(s);
ints[i]=integer;
i++;
// Buffer full: hand it off and allocate a fresh one (the task keeps a
// reference, so the producer must not reuse the old array).
if (i>=ints.length) {
findMaxTask findMaxTask = new findMaxTask(ints,new int[MaxNum]);
executorService.submit(findMaxTask);
i=0;
int[] ints0=new int[RoundCount];
ints=ints0;
rounds++;
}
}
// Busy-wait until every chunk has published its results into `last`.
// NOTE(review): if the chunks do not fill `last` exactly (total published
// values != RoundCount) this spins forever — confirm the sizing.
while (Position.get()<last.length){
}
System.out.println("last task start");
// Final reduction: overall top MaxNum among all chunk winners.
// NOTE(review): this task's publish loop indexes last[Position++] past the
// end and throws ArrayIndexOutOfBounds, which submit() silently swallows;
// MaxAll is already filled by insert() before that point, so the printed
// result is unaffected — but the swallowed error masks real failures.
findMaxTask findMaxTask = new findMaxTask(last,MaxAll);
executorService.submit(findMaxTask);
executorService.shutdown();
while (!executorService.awaitTermination(10, TimeUnit.MILLISECONDS)) {
}
System.out.println("MaxAll:");
for (int a:MaxAll){
System.out.print(a+" - ");
}
System.out.println("");
// Report per-thread busy time — shows consumers are mostly idle,
// waiting on the single reader thread.
long sum=0;
for (Map.Entry<String ,Long> entry:hashMap.entrySet()){
System.out.println(entry.getKey()+" Occupy time :"+entry.getValue()+"ms");
sum+=entry.getValue();
}
System.out.println("sum of Occupy time: "+sum+"ms");
}
- 并未显著提高效率,多个写线程只占有很少一部分时间,大部分在等待读线程提供数据。
MPMC - 多生产者多消费者模式
/**
 * MPMC variant: multiple producers and multiple consumers.
 *
 * The file is split into ReadTaskNum fixed-size regions (dataSize bytes per
 * record). A pool of readFileThreadNum producer threads runs one ReadTask
 * per region; each ReadTask submits findMaxTask jobs to the consumer pool,
 * which publish their per-chunk winners into `last` (cursor: the shared
 * atomic `Position`). A final task reduces `last` into MaxAll.
 *
 * NOTE(review): Position, MaxAll, findMaxTask, ReadTask, RoundCount,
 * Threshould, dataSize, ReadTaskNum and readFileThreadNum are declared
 * elsewhere in this class — the flow described here assumes they mirror
 * the SPMC versions; confirm against their definitions.
 *
 * @param file   path of the fixed-width data file
 * @param MaxNum how many of the largest values to keep
 */
public void findMaxByMPMC(String file,int MaxNum) throws IOException, InterruptedException {
int[] last=new int[RoundCount];
// NOTE(review): neither the RandomAccessFile nor the channel is closed.
RandomAccessFile randomAccessFile = new RandomAccessFile(file, "rw");
FileChannel channel = randomAccessFile.getChannel();
// Split the available cores between the producer and consumer pools.
int findMaxThreadNum = java.lang.Runtime.getRuntime().availableProcessors()-readFileThreadNum;
ExecutorService executorReadFileService = Executors.newFixedThreadPool(readFileThreadNum);
ExecutorService executorFindMaxService = Executors.newFixedThreadPool(findMaxThreadNum);
// Per-thread busy-time accounting for each pool, keyed by the pools'
// default thread names.
// NOTE(review): these HashMaps are written by pool threads without
// synchronization — same data race as in the SPMC version.
HashMap<String, Long> readFileMap = new HashMap<>();
for(int K=1;K<=readFileThreadNum;K++){
String threadName="pool-1-thread-"+K;
readFileMap.put(threadName,0L);
}
HashMap<String, Long> findMaxMap = new HashMap<>();
for(int K=1;K<=findMaxThreadNum;K++){
String threadName="pool-2-thread-"+K;
findMaxMap.put(threadName,0L);
}
long position=0;
// Region size in bytes and record count per region.
long size=(Threshould*dataSize)/ReadTaskNum;
// NOTE(review): `num` is computed but never used in this method.
long num=Threshould/ReadTaskNum;
// One ReadTask per region; each producer maps its region by offset and
// feeds findMaxTask jobs to the consumer pool.
for(int i=0;i<ReadTaskNum;i++){
ReadTask readTask = new ReadTask(channel, size, position, executorFindMaxService,last,readFileMap,findMaxMap);
executorReadFileService.submit(readTask);
position+=size;
}
executorReadFileService.shutdown();
// Busy-wait until every chunk has published its winners into `last`.
// NOTE(review): spins forever if the published count never reaches
// RoundCount — confirm sizing, as in the SPMC version.
while (Position.get()<last.length){
}
System.out.println("last task start");
// Final reduction over all chunk winners.
findMaxTask findMaxTask = new findMaxTask(last,MaxAll,last,findMaxMap);
executorFindMaxService.submit(findMaxTask);
executorFindMaxService.shutdown();
while (!executorReadFileService.awaitTermination(10, TimeUnit.MILLISECONDS)) {
}
while (!executorFindMaxService.awaitTermination(10, TimeUnit.MILLISECONDS)) {
}
System.out.println("MaxAll:");
for (int a:MaxAll){
System.out.print(a+" - ");
}
System.out.println("");
// Report busy time per producer thread, then per consumer thread.
long sum=0;
for (Map.Entry<String ,Long> entry:readFileMap.entrySet()){
System.out.println("readFileMap:"+entry.getKey()+" Occupy time :"+entry.getValue()+"ms");
sum+=entry.getValue();
}
System.out.println("readFileMap:sum of Occupy time: "+sum+"ms");
sum=0;
for (Map.Entry<String ,Long> entry:findMaxMap.entrySet()){
System.out.println("findMaxMap:"+entry.getKey()+" Occupy time :"+entry.getValue()+"ms");
sum+=entry.getValue();
}
System.out.println("findMaxMap:sum of Occupy time: "+sum+"ms");
}
-
大致流程为:先把1亿数据分为10份,每份1千万个数,每个数据占10字节一共1亿字节,通过偏移量就可以很快的分割数据了。
-主线程 产生10个ReadTask,交给生产者执行。 -
每份数据由生产者线程处理,生产者线程每次获取到10万个数就生成一个分析任务,由分析线程池执行。
-
每个生产者产生100个findMaxtask,交给消费者执行。
-
10(生产者)*100(findMaxTask)*100(每个findMaxTask获得最大100个数)=10万个数
-
最后一个任务–》从这10万个数获得最大的100个数
-
调整生产者:消费者比例,统计时间消耗
- 调整生产者:消费者比例=1:11
- 调整生产者:消费者比例=2:10
- 调整生产者:消费者比例=3:9
- 调整生产者:消费者比例=4:8
- 调整生产者:消费者比例=5:7
- 调整生产者:消费者比例=6:6
- 调整生产者:消费者比例=7:5
- 调整生产者:消费者比例=8:4
- 调整生产者:消费者比例=9:3
- 调整生产者:消费者比例=10:2
- 调整生产者:消费者比例=11:1
- 消耗时间表
- 在10个生产者线程时消耗时间有一个大的下降,由于任务分成10份,刚好同时启动,效率提高很多
如果是1000亿数据呢,一台机器都无法储存下,获取最大k个数(topk问题)
- 自然而然的想到每个机器合并一部分之后合并,这就是Mapreduce的思想
- hadoop单机Mapreduce方法
- 未完待续