正则表达式中反向引用
下面的 regStr 中的 \\1 就是反向引用
因为正则的规则规定:一对 **()** 中匹配到的内容为一个分组,0 号分组是整个 regStr 匹配到的结果,自己指定的分组从 1 开始编号,所以这里引用第一个分组用的是 \\1(Java 字符串字面量中的 \\1 对应正则表达式里的 \1)
下面的逻辑是去除连续重复的文字,只保留一个,即 结巴去重
/**
 * Demo of a regex back-reference used to collapse stuttered (consecutively
 * repeated) characters down to a single occurrence.
 *
 * <p>Prints group 0 and group 1 of every match, then prints the input with each
 * match replaced by its group 1.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String stuttered = "我我的名名字字字叫叫叫叫马大哈";

    // Group 1 captures one character; the back-reference to group 1 followed by
    // '+' then requires that same character to repeat at least once more.
    Matcher runs = Pattern.compile("([\u0391-\uffe5])\\1+").matcher(stuttered);

    for (boolean found = runs.find(); found; found = runs.find()) {
        System.out.println("matcher.group(0) = " + runs.group(0));
        System.out.println("matcher.group(1) = " + runs.group(1));
    }

    // Each whole run of repeats (group 0) is replaced by the single captured
    // character (group 1).
    String content = runs.replaceAll("$1");
    System.out.println("content = " + content);
}
但是……自己当时没搞懂:为什么这里用 $1 作为替换内容之后,就可以去重
matcher.replaceAll("$1");
于是就大致看了一下方法的逻辑,这里做个简单记录
// java.util.regex.Matcher
/**
 * Replaces every match in the input with the given replacement string.
 *
 * <p>The matcher is reset first, so scanning always starts from the beginning.
 * When nothing matches, the input text is returned as a string unchanged.
 *
 * @param replacement the replacement string; handed to
 *                    {@code appendReplacement} once per match
 * @return the text with every match replaced
 */
public String replaceAll(String replacement) {
    // Start from a clean state regardless of earlier find() calls.
    reset();

    // Nothing matched: hand back the input untouched.
    if (!find()) {
        return text.toString();
    }

    // Buffer that accumulates the rewritten text.
    StringBuffer sb = new StringBuffer();
    do {
        // Append the replacement for the current match to the buffer.
        appendReplacement(sb, replacement);
    } while (find());

    appendTail(sb);
    return sb.toString();
}
appendReplacement()方法中$1的处理逻辑
/**
 * Appends to {@code sb} the text between the previous append position and the
 * current match, followed by {@code replacement} with its group references
 * ({@code $n} or {@code ${name}}) expanded and its backslash escapes resolved.
 *
 * <p>The inline notes trace the call {@code appendReplacement(sb, "$1")}.
 *
 * @param sb          the buffer that receives the output
 * @param replacement the replacement string to expand
 * @return this matcher
 * @throws IllegalStateException    if there is no current match ({@code first < 0})
 * @throws IllegalArgumentException if the replacement ends with a dangling
 *                                  {@code \} or {@code $}, or contains a
 *                                  malformed or unknown group reference
 */
public Matcher appendReplacement(StringBuffer sb, String replacement) {
    // If no match, return error
    if (first < 0)
        throw new IllegalStateException("No match available");
    // Process substitution string to replace group references with groups
    /**
     * cursor starts at 0, so for "$1" the first character examined is '$'.
     **/
    int cursor = 0;
    StringBuilder result = new StringBuilder();
    while (cursor < replacement.length()) {
        char nextChar = replacement.charAt(cursor);
        if (nextChar == '\\') {
            // Backslash escape: the following character is appended literally.
            cursor++;
            if (cursor == replacement.length())
                throw new IllegalArgumentException(
                    "character to be escaped is missing");
            nextChar = replacement.charAt(cursor);
            result.append(nextChar);
            cursor++;
        } else if (nextChar == '$') {
            // Walkthrough for "$1": the '$' at index 0 enters this branch.
            // Skip past $
            // The cursor advances to index 1, i.e. to the group number "1".
            cursor++;
            // Throw IAE if this "$" is the last character in replacement
            // For "$1": cursor is 1 and replacement.length() is 2, so no exception.
            if (cursor == replacement.length())
               throw new IllegalArgumentException(
                    "Illegal group reference: group index is missing");
            // For "$1": this reads replacement.charAt(1), the '1'.
            nextChar = replacement.charAt(cursor);
            int refNum = -1;
            // For "$1": '1' is not '{', so the named-group branch is skipped.
            if (nextChar == '{') {
                cursor++;
                // Collect the group name: ASCII letters and digits only.
                StringBuilder gsb = new StringBuilder();
                while (cursor < replacement.length()) {
                    nextChar = replacement.charAt(cursor);
                    if (ASCII.isLower(nextChar) ||
                        ASCII.isUpper(nextChar) ||
                        ASCII.isDigit(nextChar)) {
                        gsb.append(nextChar);
                        cursor++;
                    } else {
                        break;
                    }
                }
                if (gsb.length() == 0)
                    throw new IllegalArgumentException(
                        "named capturing group has 0 length name");
                if (nextChar != '}')
                    throw new IllegalArgumentException(
                        "named capturing group is missing trailing '}'");
                String gname = gsb.toString();
                if (ASCII.isDigit(gname.charAt(0)))
                    throw new IllegalArgumentException(
                        "capturing group name {" + gname +
                        "} starts with digit character");
                if (!parentPattern.namedGroups().containsKey(gname))
                    throw new IllegalArgumentException(
                        "No group with name {" + gname + "}");
                // Translate the group name into its numeric index.
                refNum = parentPattern.namedGroups().get(gname);
                cursor++;
            } else {
                // The first number is always a group
                // For "$1": this else branch is taken and refNum becomes 1.
                refNum = (int)nextChar - '0';
                if ((refNum < 0)||(refNum > 9))
                    throw new IllegalArgumentException(
                        "Illegal group reference");
                // For "$1": cursor is incremented again and is now 2.
                cursor++;
                // Capture the largest legal group string
                boolean done = false;
                while (!done) {
                    // For "$1": cursor == replacement.length() == 2, so the loop exits here.
                    if (cursor >= replacement.length()) {
                        break;
                    }
                    int nextDigit = replacement.charAt(cursor) - '0';
                    if ((nextDigit < 0)||(nextDigit > 9)) { // not a number
                        break;
                    }
                    // Accept the extra digit only if the longer number does not
                    // exceed groupCount(); otherwise stop extending the number.
                    int newRefNum = (refNum * 10) + nextDigit;
                    if (groupCount() < newRefNum) {
                        done = true;
                    } else {
                        refNum = newRefNum;
                        cursor++;
                    }
                }
            }
            // Append group
            // For "$1": refNum is 1; the group is appended only when neither
            // start(1) nor end(1) is -1.
            if (start(refNum) != -1 && end(refNum) != -1)
                // Copy the group's characters out of text and append them to result.
                result.append(text, start(refNum), end(refNum));
        } else {
            // Ordinary character: copied to the result as-is.
            result.append(nextChar);
            cursor++;
        }
    }
    // Append the intervening text
    sb.append(text, lastAppendPosition, first);
    // Append the match substitution
    sb.append(result);
    // Record where this match ended (last) as the new append position.
    lastAppendPosition = last;
    return this;
}
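result.append(text, start(refNum), end(refNum)) 中的 start() 方法和 end() 方法分别返回 refNum 号分组在原文本中的起始下标和结束下标,因此这里追加的内容其实就是 matcher.group(refNum)。也就是说,每次匹配到的整段重复文字(例如"字字字")都会被替换成 1 号分组的内容(一个"字"),这就是 $1 能去重的原因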
/**
 * Returns the start index recorded for the given capturing group of the
 * current match.
 *
 * @param group index of the capturing group
 * @return the value stored at {@code groups[group * 2]}
 * @throws IllegalStateException     if there is no current match ({@code first < 0})
 * @throws IndexOutOfBoundsException if {@code group} is negative or greater
 *                                   than {@code groupCount()}
 */
public int start(int group) {
    if (first < 0) {
        throw new IllegalStateException("No match available");
    }
    boolean validGroup = 0 <= group && group <= groupCount();
    if (!validGroup) {
        throw new IndexOutOfBoundsException("No group " + group);
    }
    // The start offset of a group sits at index 2 * group of the groups array.
    return groups[2 * group];
}
/**
 * Returns the end index recorded for the given capturing group of the
 * current match.
 *
 * @param group index of the capturing group
 * @return the value stored at {@code groups[group * 2 + 1]}
 * @throws IllegalStateException     if there is no current match ({@code first < 0})
 * @throws IndexOutOfBoundsException if {@code group} is negative or greater
 *                                   than {@code groupCount()}
 */
public int end(int group) {
    if (first < 0) {
        throw new IllegalStateException("No match available");
    }
    boolean validGroup = 0 <= group && group <= groupCount();
    if (!validGroup) {
        throw new IndexOutOfBoundsException("No group " + group);
    }
    // The end offset of a group sits at index 2 * group + 1 of the groups array.
    return groups[2 * group + 1];
}
以上均为个人见解,如有错误,还请谅解!
本人学习来自 韩顺平老师的正则相关课程