有一个1G左右的日志文件,需要从中搜索出指定字符串所在的行以及它在行中的位置。有两种方法:一种是直接使用Java的函数,另一种是通过调用Linux shell命令辅助处理。下面是示例程序:
# cat TestIO.java
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.regex.Pattern;
/**
 * Small benchmark that searches a text file for a literal string in two ways and
 * prints how long each one takes:
 * <ul>
 *   <li>{@link #start()} reads the file line by line and searches in pure Java;</li>
 *   <li>{@link #startByShell()} lets {@code grep} pre-filter the matching lines and
 *       only searches those lines in Java.</li>
 * </ul>
 * Per-match reporting is intentionally disabled so that only the search is timed.
 */
public class TestIO
{
    /** Charset used for both the file and grep's output, so indices agree in both modes. */
    private static final String CHARSET = "utf-8";

    /** Compiled once; matches zero or more ASCII digits (the empty string matches too). */
    private static final Pattern DIGITS = Pattern.compile("[0-9]*");

    /** Number of lines processed by the most recent run. */
    private int lineNum = 0;
    private String path = "";
    private String searchStr = "";

    /** Sets the path of the file to search. */
    public void setPath(String value)
    {
        path = value;
    }

    /** Returns the path of the file to search. */
    public String getPath()
    {
        return path;
    }

    /** Sets the literal string to search for. */
    public void setSearchStr(String value)
    {
        searchStr = value;
    }

    /** Returns the literal string to search for. */
    public String getSearchStr()
    {
        return searchStr;
    }

    /**
     * Searches the configured file line by line using {@link String#indexOf(String, int)}
     * and prints the elapsed time. Does nothing when no path is configured.
     */
    public void start()
    {
        if (null == path || path.length() < 1)
        {
            return;
        }
        try
        {
            long startMili = System.currentTimeMillis();
            System.out.println("Start search \"" + searchStr + "\" in file: " + path);
            lineNum = 0;
            // Both resources are declared separately so the file handle is released even
            // if constructing the decoder fails.
            try (FileInputStream in = new FileInputStream(path);
                 BufferedReader reader = new BufferedReader(new InputStreamReader(in, CHARSET)))
            {
                String line;
                while ((line = reader.readLine()) != null)
                {
                    lineNum++;
                    String rs = this.searchStr(line, searchStr);
                    if (rs.length() > 0)
                    {
                        // Reporting is disabled on purpose to time the search only.
                        // System.out.println("Find in Line["+lineNum+"], index: "+rs);
                    }
                }
            }
            System.out.println("Finished!");
            long endMili = System.currentTimeMillis();
            System.out.println("Total times: " + (endMili - startMili) + " ms");
            System.out.println("");
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Searches the configured file by running {@code grep -n -F} and then locating the
     * match positions inside each line grep returns; prints the elapsed time.
     * Does nothing when no path or no search string is configured.
     */
    public void startByShell()
    {
        // Without a file operand grep would read stdin and block forever.
        if (null == path || path.length() < 1 || null == searchStr)
        {
            return;
        }
        try
        {
            long startMili = System.currentTimeMillis();
            System.out.println("Start search \"" + searchStr + "\" in file: " + path + " by shell");
            // Arguments are passed as a vector (no shell involved), so spaces or shell
            // metacharacters in the search string or path cannot alter the command.
            // -F makes grep match literally, like String.indexOf; -e and "--" keep a
            // leading '-' in either value from being parsed as an option.
            ProcessBuilder builder =
                new ProcessBuilder("grep", "-n", "-F", "-e", searchStr, "--", path);
            // Let grep's diagnostics go straight to our stderr instead of sitting unread.
            builder.redirectError(ProcessBuilder.Redirect.INHERIT);
            Process p = builder.start();
            try
            {
                lineNum = 0;
                try (BufferedReader reader =
                         new BufferedReader(new InputStreamReader(p.getInputStream(), CHARSET)))
                {
                    String line;
                    while ((line = reader.readLine()) != null)
                    {
                        lineNum++;
                        // grep -n emits "<lineNumber>:<text>"; skip anything else.
                        int colon = line.indexOf(':');
                        if (colon < 0)
                        {
                            continue;
                        }
                        String rs = this.searchStr(line.substring(colon + 1), searchStr);
                        if (rs.length() > 0)
                        {
                            // Reporting is disabled on purpose to time the search only.
                            // line.substring(0, colon) is the line number within the file.
                            // System.out.println("Find in Line["+line.substring(0, colon)+"], index: "+rs);
                        }
                    }
                }
                // grep exits with 0 (matches), 1 (no matches) or >1 (error).
                int status = p.waitFor();
                if (status > 1)
                {
                    System.err.println("grep exited with status " + status);
                }
            }
            finally
            {
                // Harmless if grep already exited; stops it if reading failed midway.
                p.destroy();
            }
            System.out.println("Finished!");
            long endMili = System.currentTimeMillis();
            System.out.println("Total times: " + (endMili - startMili) + " ms");
            System.out.println("");
        }
        catch (InterruptedException e)
        {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Finds every non-overlapping occurrence of {@code value} in {@code src}.
     *
     * @param src   text to search in
     * @param value literal text to search for
     * @return the start indices of the occurrences, each followed by a comma
     *         (for example {@code "0,12,"}); an empty string when there is no match or
     *         when either argument is {@code null} or {@code value} is empty
     */
    public String searchStr(String src, String value)
    {
        // An empty needle would match at the same index forever and never advance.
        if (src == null || value == null || value.length() == 0)
        {
            return "";
        }
        StringBuilder result = new StringBuilder();
        int index = src.indexOf(value, 0);
        while (index > -1)
        {
            result.append(index).append(',');
            index = src.indexOf(value, index + value.length());
        }
        return result.toString();
    }

    /**
     * Tells whether {@code str} consists only of ASCII digits.
     *
     * @param str text to test
     * @return {@code true} when every character is in {@code 0-9}; note that the empty
     *         string also yields {@code true}; {@code false} for {@code null}
     */
    public static boolean isNumeric(String str)
    {
        if (str == null)
        {
            return false;
        }
        return DIGITS.matcher(str).matches();
    }

    /**
     * Runs both search variants.
     *
     * @param args optional: {@code args[0]} is the file path (default
     *             {@code ./testfile.txt}), {@code args[1]} is the search string
     *             (default {@code hello})
     */
    public static void main(String[] args)
    {
        String file = "./testfile.txt";
        TestIO test = new TestIO();
        if (args.length > 0)
        {
            test.setPath(args[0]);
        }
        else
        {
            test.setPath(file);
        }
        if (args.length > 1)
        {
            test.setSearchStr(args[1]);
        }
        else
        {
            test.setSearchStr("hello");
        }
        test.start();
        test.startByShell();
    }
}
测试文件1.4G,百万条日志记录。其中
关键字hello只有不到50条记录;
chipkill占20%左右记录数;
error占50%左右记录数;
mainbuild166占99%左右记录数;
测试结果:
[root@mainbuild166 io]# java TestIO ./testfile.txt hello
Start search "hello" in file: ./testfile.txt
Finished!
Total times: 7825 ms
Start search "hello" in file: ./testfile.txt by shell
Finished!
Total times: 3080 ms
[root@mainbuild166 io]# java TestIO ./testfile.txt chipkill
Start search "chipkill" in file: ./testfile.txt
Finished!
Total times: 8760 ms
Start search "chipkill" in file: ./testfile.txt by shell
Finished!
Total times: 3732 ms
[root@mainbuild166 io]# java TestIO ./testfile.txt error
Start search "error" in file: ./testfile.txt
Finished!
Total times: 11339 ms
Start search "error" in file: ./testfile.txt by shell
Finished!
Total times: 8163 ms
[root@mainbuild166 io]# java TestIO ./testfile.txt mainbuild166
Start search "mainbuild166" in file: ./testfile.txt
Finished!
Total times: 9938 ms
Start search "mainbuild166" in file: ./testfile.txt by shell
Finished!
Total times: 12531 ms
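从以上测试结果中可以看出,当结果集远小于数据集时(如hello、chipkill),采用调用shell的方法效率远比直接使用java函数的高(耗时约为后者的40%),这也是很符合实际中的情况的。但随着匹配行所占比例升高,这一优势逐渐缩小(error:8163 ms 对 11339 ms);当几乎所有行都匹配时(mainbuild166),调用shell的方法反而更慢(12531 ms 对 9938 ms)。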