大数据的概念炒得很火,无非就是时代变了,信息量大了,需要处理的数据量也变大了。
这就考验了现代计算机的处理能力,于是聪明的人提出了分布式计算的概念,可以利用在一个网络中的所有计算机进行并行计算,大大增加了处理效率。
对于大数据的概念,最直观的就是有个大文件,需要对大文件中的数据进行处理,如果一次性读入内存处理,势必会造成计算机的巨大压力,比如有个1G的文件,要对里面的所有
数据进行排序,首先看物理开销,如果一次性排序,必然要把所有数据读入内存,至少需要1G的开销,排序还需要额外的开销,对计算机要求很高;再来看时间开销,1G的数据直
接排序,时间复杂度约为n*logn(在另一篇博文会提到JAVA默认排序算法问题),这是无法接受的。
巧干能捕雄狮,蛮干难捉蟋蟀(苏联谚语)。中国民间有,大事化小,小事化了的说法,对于这个问题,也可以大数据转化成小数据来处理。
具体思想为,把大文件分割成足够多的小文件,对小文件进行操作,然后对处理后的小文件进行归并。
以下为一例子,如有错误,请指正。
package test;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
/**
* @author wenhuan
* skype:jasonwenhuan
*/
public class HugeDataSort {
public final static String ORIGINALPATH = "E:/bigdatatest/bigData.txt" ;
public final static String TEMPFILEPATH = "E:/bigdatatest/";
public final static String LASTFILEPATH = "E:/bigdatatest/";
public final static String LASTFILENAME = "last.txt";
public final static int BIGDATALENGTH = 10000000;
public final static int TEMPFILELENGTH = 1000000;
public static int rewriteTime = 1;
private static File tempFiles[];
public static int writeTime = 0;
public static int threadNumber = 2;
public static void main(String[] args) throws IOException {
generateDate();
splitBigFileToLittleFile();
unitAllTempFileAndDeleteTempFile();
}
public static void generateDate() throws IOException {
BufferedWriter writer = new BufferedWriter(new FileWriter(ORIGINALPATH ));
Random random = new Random();
for (int i = 0; i < BIGDATALENGTH; i++) {
writer.write(String. valueOf(random.nextInt(BIGDATALENGTH)) + "\n");
}
writer.close();
}
public static void splitBigFileToLittleFile() throws IOException {
BufferedReader br = new BufferedReader(new FileReader(ORIGINALPATH ));
tempFiles = new File[BIGDATALENGTH / TEMPFILELENGTH];
for (int i = 0; i < tempFiles. length; i++) {
tempFiles[i] = new File(TEMPFILEPATH + "sortTempFile" + i + ".txt");
BufferedWriter writer = new BufferedWriter(new FileWriter(
tempFiles[i]));
List<Integer> smallLine = new ArrayList<Integer>();
for (int j = 0; j < TEMPFILELENGTH; j++) {
String text = null;
if ((text = br.readLine()) != null) {
smallLine.add(Integer. parseInt(text));
}
}
Collections. sort(smallLine);
for (Integer line : smallLine) {
writer.write(String. valueOf(line)
+ System.getProperty("line.separator"));
}
writer.close();
}
}
public static void multiWaysMergeSort(String[] files) throws IOException {
if (files.length == 1) {
String lastFilePath = LASTFILEPATH + LASTFILENAME ;
copyFile(files[0],lastFilePath,false);
deleteFile(files[0]);
return;
}
/*List<String> listFiles = Arrays.asList(files);
int filesEveryThread = tempFiles.length/threadNumber;
for( int j=0;j<threadNumber;j++){
int from = 0;
int to = 0;
from = filesEveryThread * j;
if (j == threadNumber - 1) {
to = listFiles.size();
} else {
to = threadNumber * (j + 1);
}
List<String> list = listFiles.subList(from, to);
}*/
for (int i = 0; i < files.length; i++) {
if(i == files.length -1){
renameFile(files[i],i);
break;
}
BufferedReader br1 = new BufferedReader(new FileReader(files[i]));
BufferedReader br2 = new BufferedReader(new FileReader(files[i+1]));
BufferedWriter writer = new BufferedWriter(new FileWriter(TEMPFILEPATH + "last_" + rewriteTime + "_" + i + ".txt") );
String s1 = br1.readLine();
String s2 = br2.readLine();
while (s1 != null || s2 != null) {
int mergeResult = -1;
if(s1 != null && s2 != null){
mergeResult = merge(Integer.parseInt(s1.toString()),
Integer. parseInt(s2.toString()));
}
if (mergeResult == 0) {
writer.write(s2);
writer.write(System. getProperty("line.separator"));
s2 = br2.readLine();
}
if (mergeResult == 1) {
writer.write(s1);
writer.write(System. getProperty("line.separator"));
s1 = br1.readLine();
s2 = br2.readLine();
}
if(mergeResult == 2){
writer.write(s1);
writer.write(System. getProperty("line.separator"));
s1 = br1.readLine();
}
if(s1 == null && s2 != null){
writer.write(s2);
writer.write(System. getProperty("line.separator"));
s2 = br2.readLine();
}
if(s2 == null && s1 != null){
writer.write(s1);
writer.write(System. getProperty("line.separator"));
s1 = br1.readLine();
}
System. out.println("write time : " + writeTime++);
}
br1.close();
br2.close();
deleteFile(files[i]);
deleteFile(files[i+1]);
i++;
writer.close();
}
rewriteTime++;
multiWaysMergeSort(getTempFiles ("last_" ));
}
public static int merge(int a, int b) {
if (a > b) {
return 0;
} else if (a == b) {
return 1;
} else {
return 2;
}
}
public static void unitAllTempFileAndDeleteTempFile() throws IOException {
String[] files = getTempFiles("sortTempFile");
multiWaysMergeSort(files);
}
public static String[] getTempFiles(final String startName) {
File f = new File(TEMPFILEPATH );
String[] files = f.list( new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
return name.startsWith(startName == null ? "" : startName);
}
});
String[] retFiles = new String[files.length ];
for (int i = 0; i < files.length; i++) {
retFiles[i] = TEMPFILEPATH + files[i];
}
return retFiles;
}
public static void copyFile(String org, String dst, boolean useBuffer) {
FileInputStream fis = null;
FileOutputStream fos = null;
BufferedOutputStream bos = null;
try {
fis = new FileInputStream(org);
fos = new FileOutputStream(dst);
bos = new BufferedOutputStream(new FileOutputStream(dst));
int length = 0;
byte[] bytes = new byte[1024];
while ((length = fis.read(bytes)) != -1) {
if (useBuffer) {
bos.write(bytes, 0, length);
} else {
fos.write(bytes, 0, length);
}
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (fos != null) {
try {
fos.close();
} catch (IOException e) {
}
}
if (bos != null) {
try {
bos.close();
} catch (IOException e) {
}
}
if (fis != null) {
try {
fis.close();
} catch (IOException e) {
}
}
}
}
public static boolean deleteFile(String filePath){
boolean flag = false;
File f = new File(filePath);
if(f.exists()){
f.delete();
flag = true;
}
return flag;
}
public static boolean renameFile(String fileName, int i){
File file = new File(fileName);
return file.renameTo(new File(TEMPFILEPATH + "last_" + rewriteTime + "_" + i + ".txt"));
}
class MyThread implements Runnable{
@Override
public void run() {
}
}
}