外部归并排序Java实现

 

 

package mergesort;

import java.text.DecimalFormat;
import java.util.Random;

/**
 * One record of the external-sort data set, serialized as a single text line
 * of the form {@code A#B#C}, where A is the integer sort key rendered as a
 * zero-padded 10-digit number.
 */
public class Record {
	/** DecimalFormat pattern that renders the key as ten zero-padded digits. */
	private static final String KEY_PATTERN = "0000000000";

	/** Shared generator so that a new Random is not allocated for every record. */
	private static final Random RANDOM = new Random();

	private int A;
	private String B;
	private String C;

	/**
	 * Serializes this record as {@code A#B#C}, with A zero-padded to ten digits.
	 *
	 * @return the text-line form of this record
	 */
	@Override
	public String toString() {
		return formatKey(this.A) + "#" + B + "#" + C;
	}

	/**
	 * Builds the text line of a freshly generated random record. The fields of
	 * this instance are neither read nor modified.
	 *
	 * @return a line of the form {@code A#B#C} with a random non-negative key A
	 */
	public String getRecordString(){
		// Clearing the sign bit always yields a non-negative int. Math.abs would
		// return a negative value for Integer.MIN_VALUE and produce an
		// 11-character key containing a minus sign.
		String A = formatKey(RANDOM.nextInt() & Integer.MAX_VALUE);
		String B = "郭涛"+A;
		String C = "1111111111000000000011111111110000000000111111111100000000001111111111";
		return A+"#"+B+"#"+C;
	}

	/** Creates an empty record (A is 0, B and C are null). */
	public Record() {
		super();
	}

	/**
	 * Parses a record from its {@code A#B#C} text form.
	 *
	 * @param line the serialized record
	 * @throws NumberFormatException if the first field is not a valid int
	 * @throws ArrayIndexOutOfBoundsException if the line has fewer than three fields
	 */
	public Record(String line) {
		super();
		// Limit 3 keeps a trailing empty C field and keeps any '#' inside C,
		// so that parsing is the inverse of toString().
		String[] fields = line.split("#", 3);
		this.A = Integer.parseInt(fields[0]);
		this.B = fields[1];
		this.C = fields[2];
	}

	/** @return the integer sort key */
	public int getA() {
		return A;
	}

	/** @param a the new integer sort key */
	public void setA(int a) {
		A = a;
	}

	/** @return the second field */
	public String getB() {
		return B;
	}

	/** @param b the new second field */
	public void setB(String b) {
		B = b;
	}

	/** @return the third field */
	public String getC() {
		return C;
	}

	/** @param c the new third field */
	public void setC(String c) {
		C = c;
	}

	/** Formats a key as ten zero-padded digits; DecimalFormat is not thread-safe, so one is created per call. */
	private static String formatKey(int key) {
		return new DecimalFormat(KEY_PATTERN).format(key);
	}
}

 

 

 

package mergesort;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Date;

/**
 * Two-phase external merge sort over a text file of fixed-size records.
 *
 * <p>The experiment generates a text file of 10,000,000 records of about 100
 * bytes each and sorts it on the integer attribute A only. Records are packed
 * into blocks in a non-spanned layout: space at the end of a block that is
 * smaller than one record is left unused (filled with padding). The block size
 * is taken to be 4096 bytes, and roughly 50 MB of memory is budgeted for the
 * sort.
 *
 * <p>Phase 1 ({@link #memorySort()}) cuts the input into runs that fit in
 * memory, sorts each run and writes it to its own file. Phase 2
 * ({@link #mergeSort()}) merges all run files into one sorted output file.
 *
 * @author GT 2012-10-16
 */
public class MergeSort {
	/** Padding written after each block of records: 94 NUL characters (a line separator follows). */
	private static final char[] BLOCK_PADDING = new char[94];

	private final int size = 10000000; // total number of records
	private final int sizePerBlock = 40; // records per 4 KB block at about 100 bytes per record
	private final int sizePerMemory = 500000; // records sorted in memory at once (about 50 MB)
	private final int fileSize = size/sizePerMemory; // number of sorted run files (20)
	private final int blockSize = (size%sizePerBlock)==0?(size/sizePerBlock):(size/sizePerBlock)+1; // total number of blocks (250000)
	private final int blockSizePerFile = (sizePerMemory%sizePerBlock)==0?(sizePerMemory/sizePerBlock):(sizePerMemory/sizePerBlock)+1; // blocks per run file (12500)
	private final int charBufferSizeOfReader = 4096; // phase 2: one block-sized buffer per run reader
	private final int charBufferSizeOfWriter = 41943040; // phase 2: 40M output buffer
	private final String fileDirectory = "F:\\record3\\";
	private final String recordFile = fileDirectory+"record.txt";

	private final String sortedRecordFile = fileDirectory+"sorted_record.txt";

	/**
	 * Generates the unsorted input file: {@code blockSize} blocks of
	 * {@code sizePerBlock} random records, each block followed by a padding line.
	 *
	 * @throws Exception if the file cannot be written
	 */
	public void creat() throws Exception{
		long start = System.currentTimeMillis();
		BufferedWriter out = new BufferedWriter(new FileWriter(recordFile));
		try {
			// getRecordString() does not depend on instance state, so one generator suffices.
			Record generator = new Record();
			for (int block = 0; block < blockSize; block++) {
				for (int i = 0; i < sizePerBlock; i++) {
					out.write(generator.getRecordString());
					out.newLine();
				}
				writePadding(out);
			}
		} finally {
			out.close();
		}
		long end = System.currentTimeMillis();
		System.out.println("生成数据耗时 :"+(end - start)+"ms");
	}

	/**
	 * Debug helper: prints field B of every record in the input file, skipping
	 * block padding lines, until the end of the file.
	 *
	 * @throws Exception if the file cannot be read or a line cannot be parsed
	 */
	public void read() throws Exception{
		BufferedReader in = new BufferedReader(new FileReader(recordFile));
		try {
			Record record;
			while ((record = nextRecord(in)) != null) {
				System.out.println(record.getB());
			}
		} finally {
			in.close();
		}
	}

	/**
	 * Orders records by ascending key A. Returns a negative, zero or positive
	 * value as required by the Comparator contract, so Arrays.sort can rely on it.
	 */
	Comparator<Record> comparator = new Comparator<Record>(){
		@Override
		public int compare(Record r1,Record r2)
		{
			int a1 = r1.getA();
			int a2 = r2.getA();
			// Explicit comparison instead of subtraction, which could overflow.
			return a1 < a2 ? -1 : (a1 == a2 ? 0 : 1);
		}
	};

	/**
	 * Phase 1: reads the input file in chunks of {@code sizePerMemory} records,
	 * sorts each chunk in memory and writes it to {@code record_<k>.txt} using
	 * the same block layout as the input.
	 *
	 * @throws IllegalStateException if the input file ends before all runs are read
	 * @throws Exception if a file cannot be read or written
	 */
	public void memorySort() throws Exception{
		long start = System.currentTimeMillis();
		BufferedReader in = new BufferedReader(new FileReader(recordFile));
		try {
			for (int k = 0; k < fileSize; k++) {
				Record[] records = new Record[sizePerMemory];
				for (int i = 0; i < sizePerMemory; i++) {
					records[i] = nextRecord(in);
					if (records[i] == null) {
						throw new IllegalStateException(
								"Unexpected end of " + recordFile + " while reading run " + k + ", record " + i);
					}
				}
				Arrays.sort(records, comparator); // in-memory sort of one run

				BufferedWriter out = new BufferedWriter(new FileWriter(fileDirectory+"record_"+k+".txt"));
				try {
					for (int i = 0; i < sizePerMemory; i++) {
						out.write(records[i].toString());
						out.newLine();
						if ((i + 1) % sizePerBlock == 0) {
							writePadding(out);
						}
					}
				} finally {
					out.close();
				}
			}
		} finally {
			in.close();
		}
		long end = System.currentTimeMillis();
		System.out.println("内存排序耗时 :"+(end - start)+"ms");
	}

	/**
	 * Phase 2: merges the {@code fileSize} sorted run files into
	 * {@code sorted_record.txt}. The current head record of every run is held in
	 * memory; each step writes the smallest head and replaces it with the next
	 * record from the same run. A padding line is written between output blocks.
	 *
	 * @throws Exception if a file cannot be read or written
	 */
	public void mergeSort() throws Exception{
		long start = System.currentTimeMillis();
		BufferedWriter out = new BufferedWriter(new FileWriter(sortedRecordFile),charBufferSizeOfWriter);
		BufferedReader[] in = new BufferedReader[fileSize];
		try {
			// heads[i] is the next unwritten record of run i, or null once run i is exhausted.
			Record[] heads = new Record[fileSize];
			for (int i = 0; i < fileSize; i++) {
				in[i] = new BufferedReader(new FileReader(fileDirectory+"record_"+i+".txt"),charBufferSizeOfReader);
				heads[i] = nextRecord(in[i]);
			}

			int count = 0;
			while (true) {
				// Pick the run with the smallest head; ties go to the lowest run index.
				int minIndex = -1;
				for (int i = 0; i < fileSize; i++) {
					if (heads[i] != null && (minIndex < 0 || heads[i].getA() < heads[minIndex].getA())) {
						minIndex = i;
					}
				}
				if (minIndex < 0) {
					break; // every run is exhausted
				}

				if (count != 0 && count % sizePerBlock == 0) {
					writePadding(out);
				}
				out.write(heads[minIndex].toString());
				out.newLine();

				// Advance the run just consumed. Padding lines are skipped and the
				// record that follows them becomes the new head.
				heads[minIndex] = nextRecord(in[minIndex]);
				count++;
			}
		} finally {
			try {
				closeAll(in);
			} finally {
				out.close();
			}
		}

		long end = System.currentTimeMillis();
		System.out.println("外存排序耗时 :"+(end - start)+"ms");
	}

	/** Writes the block padding followed by a line separator. */
	private static void writePadding(BufferedWriter out) throws Exception {
		out.write(BLOCK_PADDING);
		out.newLine();
	}

	/** Returns the next record from the reader, skipping padding/blank lines, or null at end of file. */
	private static Record nextRecord(BufferedReader reader) throws Exception {
		String line;
		while ((line = reader.readLine()) != null) {
			// trim() strips the NUL padding characters as well as whitespace.
			if (line.trim().length() != 0) {
				return new Record(line);
			}
		}
		return null;
	}

	/** Closes every non-null reader, then rethrows the first failure, if any. */
	private static void closeAll(BufferedReader[] readers) throws Exception {
		Exception first = null;
		for (int i = 0; i < readers.length; i++) {
			if (readers[i] == null) {
				continue;
			}
			try {
				readers[i].close();
			} catch (Exception e) {
				if (first == null) {
					first = e;
				}
			}
		}
		if (first != null) {
			throw first;
		}
	}

	/**
	 * Runs the merge phase only. Enable the commented calls to regenerate the
	 * input file and the sorted runs first.
	 *
	 * @param args unused
	 * @throws Exception if any phase fails
	 */
	public static void main(String [] args) throws Exception{
		MergeSort ms = new MergeSort();
//		ms.creat();
//		ms.read();
//		ms.memorySort();
		ms.mergeSort();
	}
}

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
先让我们看看原题的三个任务介绍: Task 1: Sorting the LINEITEM table by External Merge Sort Consider two cases: 1) using 5 buffer pages in memory for the external merge sort; 2) using 129 buffer pages in memory for the external merge sort. In the implementation, each buffer page occupies 8K bytes. The ORDERKEY attribute of the LINEITEM table is assumed to be the sort key in the external merge sort. Please report the number of passes and also the running time of the external merge sort in each case. Task 2: Organizing the sorted LINEITEM table into disk pages Please use the page format for storing variable-length records to organize the LINEITEM table sorted in Task 1. In the implementation, each disk page occupies 1K bytes. For each page we maintain a directory of slots, with a <record offset, record length> pair per slot. Both “record offset” and “record length” are 4 bytes wide. Task 3: Building a B-Tree over LINEITEM disk pages by Bulk Loading. Please use bulk loading to build a B-Tree over the disk pages of the LINEITEM table, which are generated in Task 2. The ORDERKEY attribute of the LINEITEM table is used as the (search) key for building the B-Tree. In the B-Tree, each internal node corresponds to a page of 1K bytes, both key and pointer are 4 bytes wide. Please report the running time of the bulk loading. A query interface is required for checking the B-Tree. For a reasonable ORDERKEY value, please print out all the pages visited along the path to find the corresponding record. Please also report the running time of the search.
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值