字符串查找---基于KMP算法在多层级目录文件下查找特定模式子字符串
本文参考《算法(第4版)》
基于KMP算法在多层级目录文件下查找特定模式子字符串
本文基于KMP算法在多层级目录文件下查找特定模式子字符串,核心算法是KMP算法。
其中walk()方法递归遍历目录,使用List<String> listFilePath存放所有文件的绝对路径。随后对listFilePath中的每个文件调用read()方法,检查该文件中是否含有特定的模式子字符串(根据先验知识,read()只检查每个文件的前两行),并将含有该模式子字符串的文件名打印出来。
1.实现代码
package subStringSearch;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Searches the files below a directory tree for a fixed pattern string using
 * the DFA-based Knuth-Morris-Pratt (KMP) substring search.
 *
 * <p>Typical use: construct with the pattern, call {@link #walk(File, String)}
 * to collect file paths into {@code listFilePath}, then call
 * {@link #read(String, String)} for each collected path.
 */
public class KMPInFiles {
    /** Alphabet size: one DFA row for every possible {@code char} value. */
    private static final int R = 65536;

    /** Number of leading lines of each file that {@link #read} inspects (chosen from prior knowledge of the files). */
    private static final int MAX_LINES_PER_FILE = 2;

    /** The pattern this instance was compiled for. */
    private final String pat;

    /** KMP automaton: {@code dfa[c][j]} is the next state after reading char {@code c} in state {@code j}. */
    private final int[][] dfa;

    List<File> listDirectory = new ArrayList<File>(); // every directory found by walk()
    List<File> listFile = new ArrayList<File>();      // every regular file found by walk()
    List<String> listDirPath = new ArrayList<String>();  // path string of every directory, each ending in "/"
    List<String> listFilePath = new ArrayList<String>(); // path string of every regular file

    /**
     * Builds the KMP automaton for the given pattern.
     *
     * @param pat the pattern to search for; must be non-null and non-empty
     * @throws NullPointerException if {@code pat} is null
     * @throws IllegalArgumentException if {@code pat} is empty
     */
    public KMPInFiles(String pat) {
        if (pat == null) {
            throw new NullPointerException("pat must not be null");
        }
        if (pat.isEmpty()) {
            // Without this guard pat.charAt(0) below fails with an opaque index error.
            throw new IllegalArgumentException("pat must not be empty");
        }
        this.pat = pat;
        int M = pat.length();
        dfa = new int[R][M];
        dfa[pat.charAt(0)][0] = 1;
        for (int X = 0, j = 1; j < M; j++) {
            for (int c = 0; c < R; c++) {
                dfa[c][j] = dfa[c][X];        // copy the mismatch transitions from the restart state
            }
            dfa[pat.charAt(j)][j] = j + 1;    // set the match transition
            X = dfa[pat.charAt(j)][X];        // update the restart state
        }
    }

    /**
     * Finds the first occurrence of the pattern in {@code txt}.
     *
     * @param txt the text to scan
     * @return the index of the first match, or {@code txt.length()} when the
     *         pattern does not occur
     */
    public int search(String txt) {
        int i, j, N = txt.length(), M = pat.length();
        for (i = 0, j = 0; i < N && j < M; i++) {
            j = dfa[txt.charAt(i)][j];
        }
        if (j == M) {
            return i - M;   // reached the accepting state: match ends just before i
        }
        return N;           // ran off the end of the text without a match
    }

    /**
     * Scans the first {@value #MAX_LINES_PER_FILE} lines of a file and prints
     * one report line to standard output for the first line that contains the
     * pattern. Nothing is printed when no inspected line matches.
     *
     * @param pat    retained for signature compatibility; the search and the
     *               report always use the pattern this instance was built with
     * @param infile path of the file to scan
     * @throws IOException if the file cannot be opened or read
     */
    public void read(String pat, String infile) throws IOException {
        final int m = this.pat.length();
        // NOTE(review): FileReader decodes with the platform default charset on
        // older JDKs; confirm that matches the encoding of the scanned files.
        try (BufferedReader in = new BufferedReader(new FileReader(new File(infile)))) {
            String text;
            for (int row = 1; row <= MAX_LINES_PER_FILE && (text = in.readLine()) != null; row++) {
                int index = search(text);
                // search() returns text.length() on a miss, so a hit is any index
                // that leaves room for the whole pattern.
                if (index + m <= text.length()) {
                    System.out.println("filename = " + infile + " ,pat = " + this.pat
                            + " ,subtxt = " + text.substring(index, index + m));
                    break;
                }
            }
        }
    }

    /**
     * Recursively walks a directory and records what it finds in the four
     * list fields: directories go to {@code listDirectory}/{@code listDirPath},
     * regular files go to {@code listFile}/{@code listFilePath}.
     *
     * @param infile   the directory to list; if it cannot be listed (it is not
     *                 a directory, or an I/O error occurs) the call does nothing
     * @param fullpath path prefix of {@code infile}, expected to end with "/";
     *                 entry names are appended to it to form the recorded paths
     */
    public void walk(File infile, String fullpath) {
        File[] fileArr = infile.listFiles();
        if (fileArr == null) {
            return; // listFiles() signals "not a directory" or an I/O error with null
        }
        for (File fs : fileArr) {
            if (fs.isDirectory()) {
                String dirpath = fullpath + fs.getName() + "/";
                listDirectory.add(fs);
                listDirPath.add(dirpath);
                walk(fs, dirpath); // descend into the sub-directory
            } else {
                listFile.add(fs);
                listFilePath.add(fullpath + fs.getName()); // a file ends this branch of the recursion
            }
        }
    }

    /**
     * Searches every file under a root directory for a pattern.
     *
     * @param args optional: {@code args[0]} is the pattern and {@code args[1]}
     *             the root directory; the built-in defaults are used for any
     *             argument that is omitted
     * @throws IOException if a collected file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String pat = args.length > 0 ? args[0] : "4adf49dd-0a34-4220-8b5d-50f34ee34b53";
        String infile = args.length > 1 ? args[1] : "C:/Users/Administrator/Desktop/2/";
        if (!infile.endsWith("/")) {
            infile = infile + "/"; // walk() builds paths by plain concatenation
        }
        KMPInFiles kmp = new KMPInFiles(pat);
        kmp.walk(new File(infile), infile);
        for (String fs : kmp.listFilePath) {
            kmp.read(pat, fs);
        }
    }
}
输出:
filename = C:/Users/Administrator/Desktop/2/45/收费统计.rdx ,pat = 4adf49dd-0a34-4220-8b5d-50f34ee34b53 ,subtxt = 4adf49dd-0a34-4220-8b5d-50f34ee34b53