Regular Expression实现

15 篇文章 0 订阅

主要分2大块,

核心部分:就是一个NFA,只支持标准正则的操作,concatenation, union, iteration,限定上限的iteration,对应的meta character只有 (, ), |, *, ., {upper}

扩展部分:这部分是把扩展正则表达式转化为标准正则表达式,扩展正则表达式支持集合[a-z0-9],  重复次数上下界限制 {lower, upper},+, ?等

一:核心NFA部分

对于DFA来说,内部就是一张状态转移表table[state][next input char];对于NFA,内部就是一个epsilon状态转移有向图。DFA的状态转移必须由输入字符触发,而且转移是确定的。NFA可以不读输入进行转移(而且有多种转移),对应meta character,比如*,可以back reference之前的字符,而“(”和“)”也不需要consume输入,因为是meta character,不对应输入字符。NFA的不确定体现在,有多种转移,可以到达多种状态,那些状态又可以有多种转移,所以是一个Digraph的dfs问题,所以判断match的条件是从初始状态出发dfs能够到达终止状态,是一个路径搜索问题。

1 simulate(判断匹配)

1)从状态0 dfs,遍历epsilon转移,得出一个初始状态集

2)对于当前输入字符,

2).1 尝试从当前状态集的每个状态匹配,这个是consume输入的。转移到下一个状态,if (re.charAt(v) == c || re.charAt(v) == '.') states.add(v + 1)

2).2 从2).1得到的新的状态集出发多源dfs, 进行epsilon转移遍历,得出新的状态集,作为下一个字符对应的状态集

3)match判断,最终状态集里如果包含终止状态,说明终止状态是可达的。


2 epsilon转移Digraph的构建,读入标准正则表达式

1)对于(,)和* 添加一条到下一个字符的转移

2)对于iteration操作符*,添加*到back reference的实体的起始位置的双向转移,实体是单个字符或者子regex(被小括号括着)

3)对于Union操作符|,规定必须由小括号括着,即必须是(a|b),添加从(到|之后的实体的转移,以及|之后的实体到)的转移

4)对于有上限的iteration {upper}, 和*类似,添加从{到之前实体的双向转移,和从{到下一个字符(}之后)的转移。但是从{往前back reference的转移的边是带权的,权值就是重复次数 - 1。为了支持限定次数的iteration,Digraph是带权的,普通边权值设定为-1,带整数权值的边表示这条边只能走这么多次,相应的dfs算法也会做判断,每走一次decrease一次权值,直到0,0表示这条边不能走,相当于没有。


二:扩展的正则表达式支持

对原始输入的正则表达式进行处理,转化成标准正则表达式

[a-z0-9] 转化成((a|b)|c...

x+ 转化成 xx*

x?转化成x{1}

x{2, 3}转化成 xx{3-2}即xx{1}

[^abc] 先求差集charset - [abc],在转化成((a|b)|c)的形式

package excercise;
import java.util.*;
class Edge {
	int from, to, weight;
	public Edge(int from, int to, int w) {
		this.from = from;
		this.to = to;
		this.weight = w;
	}
}
class Digraph {
	private List<Edge>[] adj;
	private int V;
	public Digraph(int V) {
		this.V = V;
		adj = new List[V];
		for (int v = 0; v < V; ++v)
			adj[v] = new ArrayList<Edge>();
	}
	public void addEdge(int v, int w, int weight) {
		this.adj[v].add(new Edge(v, w, weight));
	}
	public void addEdge(int v, int w) { addEdge(v, w, -1);}
	public int V() {return V;}
	public Iterable<Edge> adj(int v) { return adj[v];}
}
class DirectedDFS {
	private boolean[] marked;
	private Digraph G;
	public DirectedDFS(Digraph G, int v) {
		marked = new boolean[G.V()];
		this.G = G;
		dfs(v);
	}
	public DirectedDFS(Digraph G, Iterable<Integer> s) {
		marked = new boolean[G.V()];
		this.G = G;
		for (int v : s) dfs(v);
	}
	public boolean marked(int v) {return marked[v];}
	private void dfs(int v) {
		marked[v] = true;
		for (Edge e : G.adj(v)) {
			if (!marked[e.to]) {
				if (e.weight < 0) dfs(e.to);
				else if (e.weight > 0) {
					e.weight--;
					dfs(e.to);
				}
			}
		}
	}
}
class NFA {
	private String re;
	private Digraph G;
	private int M;
	public NFA(String regex) {
		re = regex;
		M = regex.length();
		G = buildEpsilonTransitionGraph();
	}
	private Digraph buildEpsilonTransitionGraph() {
		Digraph G = new Digraph(M + 1);		
		Stack<Integer> ops = new Stack<Integer>();
		for (int i = 0; i < M; ++i) {
			int lp = i;
			if (re.charAt(i) == '(' || re.charAt(i) == '|') ops.push(i);
			else if (re.charAt(i) == ')') {
				int or = ops.pop();
				if (re.charAt(or) == '|') {
					lp = ops.pop();
					G.addEdge(or, i);
					G.addEdge(lp, or + 1);
				}
				else lp = or;
			}
			if (i < M - 1 && re.charAt(i + 1) == '*') {
				G.addEdge(lp, i + 1);
				G.addEdge(i + 1, lp);
			}
			if (i < M - 1 && re.charAt(i + 1) == '{') {
				int rb = re.indexOf('}', i + 1);
				int num = Integer.parseInt(re.substring(i + 2, rb));
				G.addEdge(lp, i + 1);
				G.addEdge(i + 1, lp, num - 1);
			}
			if (re.charAt(i) == '(' || re.charAt(i) == ')' || re.charAt(i) == '*' )
				G.addEdge(i, i + 1);
			else if (re.charAt(i) == '{') {
				int rb = re.indexOf('}', i);
				G.addEdge(i, rb + 1);
			}
		}
		return G;
	}
	public boolean recognizes(String text) {
		List<Integer> e_closure = new ArrayList<Integer>(); 
		DirectedDFS dfs = new DirectedDFS(G, 0);
		for (int v = 0; v < G.V(); ++v) //compute e-closure(0), epsilon closure of start state
			if (dfs.marked(v)) e_closure.add(v);
		
		for (int i = 0; i < text.length(); ++i) {
			char c = text.charAt(i);
			List<Integer> states = new ArrayList<Integer>();
			for (int v : e_closure) { //compute Y = c(X), the state set Y that state set X could goto on c
				if (v == M) continue;
				if (re.charAt(v) == c || re.charAt(v) == '.') states.add(v + 1);
			}
			if (states.isEmpty()) return false;
			dfs = new DirectedDFS(G, states); 
			e_closure = new ArrayList<Integer>(); //compute e-closure(Y)
			for (int v = 0; v < G.V(); ++v)
				if (dfs.marked(v)) e_closure.add(v);
			if (e_closure.isEmpty()) return false;
		}
		
		for (int v : e_closure)
			if (v == M) return true;
		return false;
	}
}
public class RE {
	private NFA nfa;
	private final Set<Character> metaChars;
	
	public RE(String regex) {
		char[] metaArray = {'*', '(', ')', '[', ']', '{', '}', '.', '|'};
		metaChars = new HashSet<Character>();
		for (char c : metaArray) metaChars.add(c);
		String afterProcess = preProcess(regex);
	//	System.out.println(afterProcess);
		nfa = new NFA(afterProcess);
	}
	public  boolean recognizes(String text) {
		return nfa.recognizes(text);
	}
	private String preProcess(String s) {
		StringBuilder sb = new StringBuilder();
		int lastREStart = -1;
		int leftParenthese = -1;
		int or = -1;
		for (int i = 0; i < s.length(); ++i) {
			if (s.charAt(i) == '[') {
				int rb = s.indexOf(']', i + 1);
				lastREStart = sb.length();
				sb.append(handleBracket(s, i, rb));
				i = rb;
			}
			else if (s.charAt(i) == '{') {
				int end = sb.length();
				int comma = s.indexOf(',', i + 1);
				int num = Integer.parseInt(s.substring(i + 1, comma));
				for (int j = 0; j < num; ++j)
					sb.append(sb.substring(lastREStart, end));
				int rb = s.indexOf('}', comma + 1);
				int num2 = Integer.parseInt(s.substring(comma + 1, rb));
				sb.append("{" +  (num2 - num) + "}");
				i = rb;
			}
			else if (s.charAt(i) == '+') {
				sb.append(sb.substring(lastREStart, sb.length()));
				sb.append('*');
			}
			else if (s.charAt(i) == '?') {
				sb.append("{1}");
			}
			else if (s.charAt(i) == '|') {
				if (or > 0) {
					sb.insert(0, '(');
					sb.append(')');
					lastREStart = 0;
				}
				sb.append(s.charAt(i));
				or = sb.length() - 1;
			}
			else {
				if (leftParenthese == -1) lastREStart = sb.length();
				sb.append(s.charAt(i));
				if (s.charAt(i) == '(') leftParenthese = i;
				else if (s.charAt(i) == ')') {
					lastREStart = leftParenthese;
					leftParenthese = -1;
				}
			}
		}
		if (or > 0) {
			sb.insert(0, '(');
			sb.append(')');
			lastREStart = 0;
		}
		return sb.toString();
	}
	private String handleBracket(String s, int l, int r) {
		boolean filter = true;
		boolean[] include = new boolean[256];
		if (s.charAt(l + 1) == '^') {
			filter = false;
			++l;
		}
		for (int i = l + 1; i < r; ++i) {
			if (s.charAt(i) != '-') 
				include[s.charAt(i)] = true;
			else {
				for (char c = (char)(s.charAt(i - 1) + 1); c < s.charAt(i + 1); ++c) 
					include[c] = true;
			}
		}
		ArrayDeque<Character> ad = new ArrayDeque<Character>();
		for (char c = 0; c < 256; ++c) {
			if (include[c] == filter && !metaChars.contains(c)) {
				if (ad.isEmpty()) ad.addLast(c);
				else {
					ad.addLast('|');
					ad.addLast(c);
					ad.addFirst('(');
					ad.addLast(')');
				}
			}
		}
		char[] a = new char[ad.size()];
		for (int i = 0; i < a.length; ++i)
			a[i] = ad.pollFirst();
		return new String(a);
	}
}




  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值