java如何在大文件中查找字符串_大文件字符串搜索之Java函数和调用Shell搜索的效率测试...

1G的日志文件,需要从中搜索出指定字符串所在的行和行中位置。有两种方法,一种直接使用java的函数,一种通过调用Linux shell命令辅助处理。下面是示例程序:

# cat TestIO.java

import java.io.BufferedInputStream;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.InputStreamReader;

import java.util.regex.Pattern;

/**
 * Small benchmark that searches a (large) text file for a literal string in
 * two ways and prints the elapsed time of each:
 * <ul>
 *   <li>{@link #start()} reads the file line by line in Java and scans every
 *       line with {@link String#indexOf(String, int)};</li>
 *   <li>{@link #startByShell()} lets {@code grep} (run through {@code /bin/sh})
 *       pre-filter the matching lines and scans only those in Java.</li>
 * </ul>
 * Per-match reporting is intentionally not printed so that the timings measure
 * the search itself rather than console output.
 */
public class TestIO
{
    /** Compiled once; matches strings that consist solely of ASCII digits (the empty string included). */
    private static final Pattern DIGITS = Pattern.compile("[0-9]*");

    /** Charset used to decode both the file and grep's output, so column offsets agree. */
    private static final String CHARSET = "utf-8";

    /** Number of lines consumed by the most recent search run. */
    private int lineNum = 0;

    /** Path of the file to search. */
    private String path = "";

    /** Literal text to look for. */
    private String searchStr = "";

    /**
     * Sets the path of the file to search.
     *
     * @param value file path
     */
    public void setPath(String value)
    {
        path = value;
    }

    /**
     * @return the path of the file to search
     */
    public String getPath()
    {
        return path;
    }

    /**
     * Sets the literal text to search for.
     *
     * @param value search text
     */
    public void setSearchStr(String value)
    {
        searchStr = value;
    }

    /**
     * @return the literal text to search for
     */
    public String getSearchStr()
    {
        return searchStr;
    }

    /**
     * Searches the configured file purely in Java: every line is read and
     * scanned with {@link #searchStr(String, String)}. Prints the elapsed
     * time when done. Does nothing when no path is configured; any failure is
     * reported with a stack trace.
     */
    public void start()
    {
        if (null == path || path.length() < 1)
        {
            return;
        }

        try
        {
            long startMili = System.currentTimeMillis();
            System.out.println("Start search \"" + searchStr + "\" in file: " + path);

            // try-with-resources closes the file even when reading fails.
            try (FileInputStream fis = new FileInputStream(new File(path));
                 BufferedReader reader = new BufferedReader(
                         new InputStreamReader(new BufferedInputStream(fis), CHARSET)))
            {
                String line;
                lineNum = 0;
                while ((line = reader.readLine()) != null)
                {
                    lineNum++;
                    String rs = this.searchStr(line, searchStr);
                    if (rs.length() > 0)
                    {
                        // Match found at line lineNum, columns rs. Reporting is
                        // left out so the benchmark times the search only.
                    }
                }
            }

            System.out.println("Finished!");
            long endMili = System.currentTimeMillis();
            System.out.println("Total times: " + (endMili - startMili) + " ms");
            System.out.println("");
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Searches the configured file by running {@code grep -n -F} through
     * {@code /bin/sh} and scanning only the lines grep returns. Prints the
     * elapsed time when done. Does nothing when no path is configured; any
     * failure is reported with a stack trace.
     */
    public void startByShell()
    {
        if (null == path || path.length() < 1)
        {
            return;
        }

        Process p = null;
        try
        {
            long startMili = System.currentTimeMillis();
            System.out.println("Start search \"" + searchStr + "\" in file: " + path + " by shell");

            String needle = (searchStr == null) ? "" : searchStr;

            // The search text and path are handed to the shell as positional
            // parameters ($1, $2) instead of being pasted into the command
            // line, so spaces and shell metacharacters cannot alter the command.
            // -F makes grep match literally, like indexOf below; "--" stops a
            // search text starting with '-' from being read as an option.
            ProcessBuilder builder = new ProcessBuilder(
                    "/bin/sh", "-c", "grep -n -F -- \"$1\" \"$2\"", "sh", needle, path);
            // Let grep's diagnostics go straight to our stderr so an unread
            // pipe can never block the child.
            builder.redirectError(ProcessBuilder.Redirect.INHERIT);
            p = builder.start();
            p.getOutputStream().close(); // grep gets no stdin from us

            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new BufferedInputStream(p.getInputStream()), CHARSET)))
            {
                String line;
                lineNum = 0;
                while ((line = reader.readLine()) != null)
                {
                    lineNum++;
                    // grep -n prefixes every hit with "<lineNumber>:".
                    int colon = line.indexOf(':');
                    if (colon < 0)
                    {
                        continue; // not a "<n>:<text>" line; nothing to scan
                    }
                    String rs = this.searchStr(line.substring(colon + 1), needle);
                    if (rs.length() > 0)
                    {
                        // Match found at line line.substring(0, colon), columns rs.
                        // Reporting is left out so the benchmark times the search only.
                    }
                }
            }

            // grep exits 0 on match, 1 on no match, >1 on error.
            int exit = p.waitFor();
            if (exit > 1)
            {
                System.err.println("grep exited with status " + exit);
            }

            System.out.println("Finished!");
            long endMili = System.currentTimeMillis();
            System.out.println("Total times: " + (endMili - startMili) + " ms");
            System.out.println("");
        }
        catch (InterruptedException e)
        {
            Thread.currentThread().interrupt(); // preserve the interrupt for callers
            e.printStackTrace();
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            if (p != null)
            {
                p.destroy(); // no-op if already exited; frees process resources otherwise
            }
        }
    }

    /**
     * Finds every non-overlapping occurrence of {@code value} in {@code src}.
     *
     * @param src   text to scan
     * @param value literal text to look for
     * @return the zero-based start indexes of all occurrences, each followed
     *         by a comma (e.g. {@code "0,12,"}); the empty string when there
     *         is no occurrence or when either argument is null or
     *         {@code value} is empty
     */
    public String searchStr(String src, String value)
    {
        // An empty needle "matches" at every index without ever advancing,
        // which would loop forever; treat it (and null) as "no match".
        if (src == null || value == null || value.length() == 0)
        {
            return "";
        }

        StringBuilder result = new StringBuilder();
        int index = src.indexOf(value, 0);
        while (index > -1)
        {
            result.append(index).append(',');
            // Continue after the current hit so matches do not overlap.
            index = src.indexOf(value, index + value.length());
        }
        return result.toString();
    }

    /**
     * Tells whether a string consists solely of ASCII digits.
     *
     * @param str string to test; must not be null
     * @return {@code true} if every character is in {@code 0-9}; note that
     *         the empty string also yields {@code true}
     */
    public static boolean isNumeric(String str)
    {
        return DIGITS.matcher(str).matches();
    }

    /**
     * Runs both search variants one after the other.
     *
     * @param args optional: {@code args[0]} is the file to search (default
     *             {@code ./testfile.txt}), {@code args[1]} the text to search
     *             for (default {@code hello})
     */
    public static void main(String[] args)
    {
        String file = "./testfile.txt";
        TestIO test = new TestIO();

        if (args.length > 0)
        {
            test.setPath(args[0]);
        }
        else
        {
            test.setPath(file);
        }

        if (args.length > 1)
        {
            test.setSearchStr(args[1]);
        }
        else
        {
            test.setSearchStr("hello");
        }

        test.start();
        test.startByShell();
    }
}

测试文件1.4G,百万条日志记录。其中

关键字hello只有不到50条记录;

chipkill占20%左右记录数;

error占50%左右记录数;

mainbuild166占99%左右记录数;

测试结果:

[root@mainbuild166 io]# java TestIO ./testfile.txt hello

Start search "hello" in file: ./testfile.txt

Finished!

Total times: 7825 ms

Start search "hello" in file: ./testfile.txt by shell

Finished!

Total times: 3080 ms

[root@mainbuild166 io]# java TestIO ./testfile.txt chipkill

Start search "chipkill" in file: ./testfile.txt

Finished!

Total times: 8760 ms

Start search "chipkill" in file: ./testfile.txt by shell

Finished!

Total times: 3732 ms

[root@mainbuild166 io]# java TestIO ./testfile.txt error

Start search "error" in file: ./testfile.txt

Finished!

Total times: 11339 ms

Start search "error" in file: ./testfile.txt by shell

Finished!

Total times: 8163 ms

[root@mainbuild166 io]# java TestIO ./testfile.txt mainbuild166

Start search "mainbuild166" in file: ./testfile.txt

Finished!

Total times: 9938 ms

Start search "mainbuild166" in file: ./testfile.txt by shell

Finished!

Total times: 12531 ms

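从以上测试结果可以看出:当结果集远小于数据集时(如 hello、chipkill),调用 shell 的方式比直接使用 Java 函数快一倍以上;随着命中比例升高,两者差距逐渐缩小(error);当几乎所有行都命中时(mainbuild166),调用 shell 反而更慢。实际场景中结果集通常远小于数据集,因此调用 shell 的方法一般效率更高。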

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值