JAVA匹配字符串 处理

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.oro.text.regex.PatternCompiler;
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.apache.oro.text.regex.Perl5Substitution;
import org.apache.oro.text.regex.Util;
import org.json.JSONException;
import org.json.JSONObject;

import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import sun.io.*;

public class ConvertMsgContent {
/** Class-wide commons-logging logger; never reassigned, hence final. */
private static final Log log = LogFactory.getLog(ConvertMsgContent.class);

/**
 * Splits message HTML into fragments with {@link #format(String)} and
 * serializes them back into a single string with {@link #seriesTag(List)}.
 *
 * @param content raw message HTML; may be {@code null}
 * @return the serialized message, or an empty string when {@code content} is {@code null}
 */
public static String formatAndSeriesTag(String content) {
    // format() returns null for null input and seriesTag() iterates its argument,
    // so a null message would otherwise fail with a NullPointerException.
    if (content == null) {
        return "";
    }
    return seriesTag(format(content));
}

/**
 * Splits message HTML into an ordered list of fragments using a
 * character-by-character state machine.
 *
 * <p>Text seen while the nesting depth is zero is emitted as a
 * {@code TypeText} fragment when the next {@code '<'} arrives. Each time a
 * {@code '>'} brings the nesting depth back to zero, the characters
 * accumulated so far are emitted as a {@code TypeUndefine} fragment and
 * handed to {@code analysisTag} for classification. Whatever is left in the
 * buffer at the end of the input is emitted as a {@code TypeText} fragment
 * with its tags stripped by {@code deleteHtmlTag}.
 *
 * @param content raw message HTML
 * @return the fragments in input order, or {@code null} when {@code content} is {@code null}
 */
public static List<ContentFragment> format(String content) {

if (content == null) return null;

List<ContentFragment> ret = new Stack<ContentFragment>();
StringBuilder thisElement = new StringBuilder();
StringBuilder oldContentBuilder = new StringBuilder(content);
int stats = 0;// parser state: 0 outside a tag, 1 just after '<', 2 inside "</...", 3 inside "<...", 4 inside "<.../"
int tagDeep = 0;// tag nesting depth
String tagName = "";
boolean tagNameFound = false;
for (int i = 0, j = oldContentBuilder.length(); i < j; i++) {
boolean closeTag = false;
char ch = oldContentBuilder.charAt(i);
if (ch == '<') { // start of a tag
if (stats == 0) {
// Flush plain text collected at depth zero before the new tag begins.
if (tagDeep == 0 && thisElement.length() > 0) {
ContentFragment instance = new ContentFragment(ContentFragment.TypeText, thisElement.toString());
ret.add(instance);
thisElement.delete(0, thisElement.length());
}
stats = 1;
tagNameFound = false;
tagName = "";
}
} else if (ch == ' ') {
// The first space inside an opening tag ends the tag name.
if (stats == 3) tagNameFound = true;
} else if (ch == '/') { // slash: closing tag or self-closing tag
if (stats == 1) stats = 2;
else if (stats == 3) stats = 4;
} else if (ch == '>') {
if (stats == 2) {
// End of a closing tag "</...>".
tagDeep--;
stats = 0;
if (tagDeep == 0) closeTag = true;
} else if (stats == 3) {
// End of an opening tag "<...>".
tagDeep++;
if ("img".equalsIgnoreCase(tagName) || "meta".equalsIgnoreCase(tagName))
tagDeep--;// img and meta tags may be left unclosed, so they do not add depth
stats = 0;
if (tagDeep == 0) closeTag = true;
} else if (stats == 4) {
// End of a self-closing tag "<.../>": depth is unchanged.
stats = 0;
if (tagDeep == 0) closeTag = true;
}
} else {
// Any other character: a '/' seen in state 4 was not the end of the tag after all.
if (stats == 1) {
stats = 3;
} else if (stats == 4) stats = 3;
if (tagNameFound == false) tagName = tagName + ch;
}
thisElement.append(ch);
if (closeTag) {
ContentFragment instance = new ContentFragment(ContentFragment.TypeUndefine, thisElement.toString());
ret.add(instance);
thisElement.delete(0, thisElement.length());
analysisTag(instance);
}
}
// Trailing characters that never formed a complete top-level element.
if (thisElement.length() > 0) {
ContentFragment instance = new ContentFragment(ContentFragment.TypeText, deleteHtmlTag(thisElement.toString()));
ret.add(instance);
thisElement.delete(0, thisElement.length());
}
return ret;
}

/**
 * Serializes fragments back into a single message string.
 *
 * <p>{@code TypeText} fragments contribute their full text. At, face, topic,
 * url and picture fragments are written as {@code '<'} + JSON + {@code '>'},
 * where the JSON comes from {@code createStringJson} and carries a
 * {@code type} and a {@code data} member. {@code TypeUndefine} fragments
 * contribute their {@code ext1} text. Any other type (for example
 * {@code TypePic_}) has no serialized form and is omitted.
 *
 * @param tagList fragments in output order; may be {@code null}
 * @return the serialized message; an empty string when {@code tagList} is {@code null}
 */
public static String seriesTag(List<ContentFragment> tagList) {
    if (tagList == null) {
        return "";
    }
    StringBuilder out = new StringBuilder();
    for (ContentFragment tag : tagList) {
        switch (tag.getType()) {
            case ContentFragment.TypeText:
                out.append(tag.getAll());
                break;
            case ContentFragment.TypeAt:
                out.append('<').append(createStringJson("at", "name", tag.getExt1())).append('>');
                break;
            case ContentFragment.TypeFace:
                out.append('<').append(createStringJson("em", "em", tag.getExt1())).append('>');
                break;
            case ContentFragment.TypeHuati:
                out.append('<').append(createStringJson("topic", "topic", tag.getExt1())).append('>');
                break;
            case ContentFragment.TypeUrl:
                out.append('<')
                        .append(createStringJson("url", "short", tag.getExt1(), "long", tag.getExt2()))
                        .append('>');
                break;
            case ContentFragment.TypePic:
                out.append('<')
                        .append(createStringJson("pic", "w", tag.getExt1(), "h", tag.getExt2(), "url", tag.getExt3()))
                        .append('>');
                break;
            case ContentFragment.TypeUndefine:
                // ext1 stays null for unrecognised tags without visible text;
                // StringBuilder.append(null) would write the literal "null".
                if (tag.getExt1() != null) {
                    out.append(tag.getExt1());
                }
                break;
            default:
                // No serialized form for this fragment type.
                break;
        }
    }
    return out.toString();
}

/**
 * Builds the JSON text for one serialized fragment: an object with a
 * {@code type} member and a {@code data} member holding the given
 * key/value pairs.
 *
 * @param type      value stored under {@code "type"}
 * @param keyValues alternating keys and values; a trailing key without a value is ignored
 * @return the JSON text; if a {@link JSONException} occurs it is logged and
 *         whatever was stored before the failure is returned
 */
private static String createStringJson(String type, Object... keyValues) {
    JSONObject envelope = new JSONObject();
    try {
        JSONObject payload = new JSONObject();
        int pairCount = keyValues.length / 2;
        for (int pair = 0; pair < pairCount; pair++) {
            Object key = keyValues[2 * pair];
            Object value = keyValues[2 * pair + 1];
            payload.put(key.toString(), value);
        }
        envelope.put("type", type);
        envelope.put("data", payload);
    } catch (JSONException e) {
        log.error("createStringJson", e);
    }
    return envelope.toString();
}

/**
 * Classifies a {@code TypeUndefine} fragment in place, based on the visible
 * text of its markup (tags stripped, then trimmed):
 * <ul>
 *   <li>text starting with {@code @}: {@code TypeAt}, ext1 = text after the {@code @};</li>
 *   <li>text wrapped in {@code #...#}: {@code TypeHuati}, ext1 = text between the hashes;</li>
 *   <li>text starting with {@code http://}: {@code TypeUrl}, ext1 = the text,
 *       ext2 = the {@code title} URL if present, otherwise the text;</li>
 *   <li>no visible text, markup contains {@code class} and {@code title="[}:
 *       {@code TypeFace}, ext1 = the title;</li>
 *   <li>no visible text, markup contains {@code meta}: {@code TypePic},
 *       ext1/ext2/ext3 = w/h/url;</li>
 *   <li>fewer than four visible characters and markup contains {@code pic_icon}: {@code TypePic_};</li>
 *   <li>anything else stays {@code TypeUndefine} with ext1 = the tag-stripped text.</li>
 * </ul>
 * Fragments of any other type are left untouched.
 *
 * @param instance the fragment to classify
 */
private static void analysisTag(ContentFragment instance) {
    if (instance.getType() != ContentFragment.TypeUndefine) {
        return;
    }

    String raw = instance.getAll();
    String displayContent = deleteHtmlTag(raw).trim();

    if (displayContent.startsWith("@")) {
        instance.setType(ContentFragment.TypeAt);
        instance.setExt1(displayContent.substring(1));
    } else if (displayContent.length() > 1
            && displayContent.startsWith("#") && displayContent.endsWith("#")) {
        // The length check keeps a lone "#" out of this branch: it both starts and
        // ends with '#', and substring(1, 0) would throw.
        instance.setType(ContentFragment.TypeHuati);
        instance.setExt1(displayContent.substring(1, displayContent.length() - 1));
    } else if (displayContent.startsWith("http://")) {
        instance.setType(ContentFragment.TypeUrl);
        instance.setExt1(displayContent);
        String originUrl = getUrlTitle(raw);
        if (originUrl == null || originUrl.length() == 0) {
            originUrl = displayContent;
        }
        instance.setExt2(originUrl);
    } else if (displayContent.length() == 0) {
        if (raw.contains("class") && raw.contains("title=\"[")) {
            instance.setType(ContentFragment.TypeFace);
            instance.setExt1(getFaceTitle(raw));
        } else if (raw.contains("meta")) {
            instance.setType(ContentFragment.TypePic);
            Map<String, String> pic = getRetwitterPic(raw);
            instance.setExt1(pic.get("w"));
            instance.setExt2(pic.get("h"));
            instance.setExt3(pic.get("url"));
        } else {
            // Unrecognised tag without visible text (e.g. "<br/>"): record its
            // tag-stripped text so ext1 is never left null for TypeUndefine.
            instance.setExt1(deleteHtmlTag(raw));
        }
    } else if (displayContent.length() < 4 && raw.contains("pic_icon")) {
        instance.setType(ContentFragment.TypePic_);
    } else {
        instance.setExt1(deleteHtmlTag(raw));
    }
}

/**
 * Extracts the picture attributes from a tag's markup.
 *
 * @param content markup to search for {@code url="..."}, {@code  h="..."} and {@code  w="..."}
 * @return a map with the keys {@code "w"}, {@code "h"} and {@code "url"};
 *         an attribute that is missing or unterminated maps to an empty string
 */
private static Map<String, String> getRetwitterPic(String content) {
    Map<String, String> result = new HashMap<String, String>();
    result.put("w", quotedValueAfter(content, " w=\""));
    result.put("h", quotedValueAfter(content, " h=\""));
    result.put("url", quotedValueAfter(content, "url=\""));
    return result;
}

/** Returns the text between the first occurrence of {@code prefix} and the next double quote, or "" if either is missing. */
private static String quotedValueAfter(String content, String prefix) {
    int prefixAt = content.indexOf(prefix);
    // indexOf reports "not found" as -1, so index 0 is a valid match.
    if (prefixAt < 0) {
        return "";
    }
    int valueStart = prefixAt + prefix.length();
    int closingQuote = content.indexOf('"', valueStart);
    if (closingQuote < 0) {
        return "";
    }
    return content.substring(valueStart, closingQuote);
}


/**
 * Reads the value of the first {@code title="http://..."} attribute.
 *
 * @param all markup to search; may be {@code null}
 * @return the attribute value (starting with {@code http://}), or {@code null}
 *         when the input is {@code null}, the attribute is absent, or its
 *         closing quote is missing
 */
private static String getUrlTitle(String all) {
    if (all == null) {
        return null;
    }
    int attributeAt = all.indexOf("title=\"http://");
    if (attributeAt == -1) {
        return null;
    }
    // The value starts right after the 7 characters of title=" .
    int valueStart = attributeAt + 7;
    int closingQuote = all.indexOf('"', valueStart);
    return closingQuote == -1 ? null : all.substring(valueStart, closingQuote);
}

/**
 * Reads the value of the first {@code title="..."} attribute.
 *
 * @param all markup to search; may be {@code null}
 * @return the attribute value, or {@code null} when the input is
 *         {@code null}, the attribute is absent, or its closing quote is missing
 */
private static String getFaceTitle(String all) {
    if (all == null) {
        return null;
    }
    int attributeAt = all.indexOf("title=\"");
    if (attributeAt == -1) {
        return null;
    }
    // The value starts right after the 7 characters of title=" .
    int valueStart = attributeAt + 7;
    int closingQuote = all.indexOf('"', valueStart);
    return closingQuote == -1 ? null : all.substring(valueStart, closingQuote);
}

/**
 * Strips a leading {@code @} from a mention.
 *
 * @param content mention text; may be {@code null}
 * @return the text without its leading {@code @}, the text unchanged when it
 *         has none, or an empty string for {@code null}
 */
private static String getAtName(String content) {
    if (content == null) {
        return "";
    }
    return content.startsWith("@") ? content.substring(1) : content;
}

/**
 * Strips the surrounding {@code #} pair from a topic.
 *
 * @param content topic text; may be {@code null}
 * @return the text between the hashes when the input is wrapped in
 *         {@code #...#}, the input unchanged otherwise, or an empty string
 *         for {@code null}
 */
private static String getHuati(String content) {
    if (content == null) {
        return "";
    }
    // A lone "#" both starts and ends with '#'; without the length check
    // substring(1, 0) would throw StringIndexOutOfBoundsException.
    boolean wrapped = content.length() > 1 && content.startsWith("#") && content.endsWith("#");
    return wrapped ? content.substring(1, content.length() - 1) : content;
}

/**
 * Removes everything between {@code '<'} and the next {@code '>'}, both
 * delimiters included, and returns the remaining characters.
 *
 * <p>An unterminated {@code '<'} swallows the rest of the input; a
 * {@code '>'} outside a tag is kept as ordinary text.
 *
 * @param content text that may contain markup; may be {@code null}
 * @return the text outside tags, or an empty string for {@code null}
 */
public static String deleteHtmlTag(String content) {
    if (content == null) {
        return "";
    }
    StringBuilder visible = new StringBuilder();
    boolean insideTag = false;
    for (char current : content.toCharArray()) {
        if (insideTag) {
            // Only '>' ends a tag; every other character inside it is dropped.
            if (current == '>') {
                insideTag = false;
            }
            continue;
        }
        if (current == '<') {
            insideTag = true;
        } else {
            visible.append(current);
        }
    }
    return visible.toString();
}

public static class ContentFragment {
// Fragment type codes.
public final static int TypeText = 0; // plain text
public final static int TypeHuati = 1; // topic, written as #...#
public final static int TypeAt = 2; // @name mention
public final static int TypeUrl = 3; // url
public final static int TypeFace = 4; // emoticon
public final static int TypePic = 5; // picture
public final static int TypePic_ = 6; // markup containing "pic_icon" with fewer than four visible characters
public final static int TypeUndefine = 10; // tag element that is not (or not yet) classified

private int type = TypeText;// 0 plain text; 1 topic; 2 @name; 3 url; 4 emoticon; 5 picture
// The complete source text of this fragment.
private String all;
// Type-specific extracted values; what each one holds depends on the fragment type.
private String ext1;
private String ext2;
private String ext3;

/**
 * Creates a fragment with the given type code and source text; the ext
 * values start out unset.
 *
 * @param type one of the {@code Type*} constants
 * @param all  the complete source text of the fragment
 */
private ContentFragment(int type, String all) {
this.type = type;
this.all = all;
}

/** @return the fragment's type code (one of the {@code Type*} constants) */
public int getType() {
return type;
}

/** @param type the new type code */
public void setType(int type) {
this.type = type;
}

/** @return the complete source text of the fragment */
public String getAll() {
return all;
}

/** @param all the new source text */
public void setAll(String all) {
this.all = all;
}

/** @return the first type-specific value, or {@code null} if never set */
public String getExt1() {
return ext1;
}

/** @param ext1 the first type-specific value */
public void setExt1(String ext1) {
this.ext1 = ext1;
}

/** @return the second type-specific value, or {@code null} if never set */
public String getExt2() {
return ext2;
}

/** @param ext2 the second type-specific value */
public void setExt2(String ext2) {
this.ext2 = ext2;
}

/** @return the third type-specific value, or {@code null} if never set */
public String getExt3() {
return ext3;
}

/** @param ext3 the third type-specific value */
public void setExt3(String ext3) {
this.ext3 = ext3;
}

/**
 * Renders the type code, source text and the three ext values for debugging.
 *
 * @return a string of the form {@code ContentFragment{type=..., all='...', ext1='...', ext2='...', ext3='...'}}
 */
@Override
public String toString() {
    StringBuilder text = new StringBuilder("ContentFragment{");
    text.append("type=").append(type);
    text.append(", all='").append(all).append('\'');
    text.append(", ext1='").append(ext1).append('\'');
    text.append(", ext2='").append(ext2).append('\'');
    text.append(", ext3='").append(ext3).append('\'');
    text.append('}');
    return text.toString();
}
/**
 * Replaces every case-insensitive match of a Perl5 regular expression in
 * {@code con}.
 *
 * <p>This method performed exactly the same steps as
 * {@link #delStatusByReg(String, String, String)}; it now delegates to it so
 * the logic exists only once. It is kept so existing callers still compile.
 *
 * @param con     input text; may be {@code null}
 * @param reg     Perl5 regular expression
 * @param replace substitution text
 * @return the result of {@code delStatusByReg(con, reg, replace)}
 */
public static String aaa(String con, String reg, String replace) {
    return delStatusByReg(con, reg, replace);
}

/**
 * Replaces every case-insensitive match of a Perl5 regular expression in
 * {@code con}, using the Jakarta ORO matcher.
 *
 * <p>The substitution is created with {@code Perl5Substitution.INTERPOLATE_ALL}.
 * This is a best-effort operation: any exception raised while compiling or
 * substituting is logged and the input is returned unchanged.
 *
 * @param con     input text; may be {@code null}
 * @param reg     Perl5 regular expression
 * @param replace substitution text
 * @return the substituted text; {@code con} itself when nothing matches or
 *         an error occurs; an empty string when {@code con} is {@code null}
 */
public static String delStatusByReg(String con, String reg, String replace) {
    if (con == null) {
        return "";
    }
    try {
        PatternCompiler compiler = new Perl5Compiler();
        org.apache.oro.text.regex.Pattern pattern =
                compiler.compile(reg, Perl5Compiler.CASE_INSENSITIVE_MASK);
        PatternMatcher matcher = new Perl5Matcher();
        if (!matcher.contains(con, pattern)) {
            return con;
        }
        Perl5Substitution sub = new Perl5Substitution(replace, Perl5Substitution.INTERPOLATE_ALL);
        return Util.substitute(matcher, pattern, sub, con, Util.SUBSTITUTE_ALL);
    } catch (Exception e) {
        // Deliberately broad: a bad pattern or replacement must not break message
        // conversion. Report through the class logger instead of stderr.
        log.error("delStatusByReg", e);
        return con;
    }
}
// Patterns that match the HTML rendering of message elements (topic, mention, url, emoticon).
// topic: captures the text between the hashes
public static String regStatusTopic = "<a href=\".*?\" target=\"_blank\">#(.*?)#</a>";
// @ mention: captures the nickname shown after the "@" icon
// NOTE(review): the title attribute is part of this pattern, so the inline '//' comment rule does not apply inside the literal.
public static String regStatusAT = "<b class=\"nm\"><a href=\".*?\" target=\"_blank\" data-content='{\"type\":\"nick\",\"nick\":\".*?\"}' title=\".*?\"><i class=\"at\">@</i>(.*?)</a></b>";
// url: captures the title (group 1) and the link text (group 2) of the first link
// NOTE(review): the pattern also requires a second, fully literal link (fixed title and
// short URL) identical to the sample text in main(); this looks like leftover test data — confirm.
public static String regStatusURL = "<a href=\".*?\" target=\"_blank\" data-content='{\"type\":\"url\"}' title=\"(.*?)\">(.*?)</a> <a href=\".*?\" target=\"_blank\" data-content='{\"type\":\"url\"}' title=\"http://blog.s135.com/category/19/\">http://t.itc.cn/rFLLE</a>";
// emoticon: captures the title attribute
public static String regStatusEm = "<i class=\".*?\" title=\"(.*?)\"></i>";


// Disabled alternative patterns that match the serialized <{json}> form instead of HTML:
// public static String regStatusTopic = "<\\{\"data\":\\{\"topic\":\"(.*?)\"\\},\"type\":\"topic\"\\}>";
// // @ mention
// public static String regStatusAT = "<\\{\"data\":\\{\"name\":\"(.*?)\"\\},\"type\":\"at\"\\}>";
// // url
// public static String regStatusURL = "<\\{\"data\":\\{\"short\":\"(http://.*?)\",\"long\":\"(http://.*?)\"\\},\"type\":\"url\"\\}>";
// // emoticon
// public static String regStatusEm = "<\\{\"data\":\\{\"em\":\"(\\[.*?\\])\"\\},\"type\":\"em\"\\}>";

// repost with picture (disabled variant)
//public static String regStatusImg = "<\\{\"data\":\\{(\"w\":\"[\\d]*\",\"h\":\"[\\d]*\",)?\"url\":\"(http://.*?)\"\\},\"type\":\"pic\"\\}>";
// repost with picture (the part embedded in the message content)
public static String regStatusImg = "<\\{\"data\":\\{(.*?)\"url\":\"(http://.*?)\"(.*?)\\},\"type\":\"pic\"\\}>";
// repost with picture (the Ext part)
public static String regStatusExtImg = "\\{\"last_twitter_id\":[\\d]*,\"richfeed_2_url\":\"(http://.*?)\",(.*?)\\}";
// line break
public static String regChangeLine = "<\\{\"data\":\\{\\},\"type\":\"nl\"\\}>";

// Replacement templates that rewrite the HTML matches above into the serialized <{json}> form.
public static String replaceTopic = "<{\"data\":{\"topic\":\"$1\"},\"type\":\"topic\"}>";
public static String replaceAT = "<{\"data\":{\"name\":\"$1\"},\"type\":\"at\"}>";
public static String replaceURL = "<{\"data\":{\"short\":\"$2\",\"long\":\"$1\"},\"type\":\"url\"}>";
public static String replaceEm = "<{\"data\":{\"em\":\"$1\"},\"type\":\"em\"}>";

// public static String replaceTopic = "#$1#";
// public static String replaceAT = "@$1 ";
// public static String replaceURL = "$2";
// public static String replaceEm = "[$1]";


/**
 * Manual timing harness: applies the topic and mention rewrites to a sample
 * message 100000 times, then prints the final text and the elapsed
 * milliseconds to standard output.
 *
 * <p>The former {@code throws} clause was dropped: nothing in the body throws
 * a checked exception, and one of the declared types came from the
 * JDK-internal {@code sun.io} package.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String con = "da <a href=\"/ht/daaa\" target=\"_blank\">#daaa#</a>这个世界 <b class=\"nm\"><a href=\"/n/%E9%9D%92%E8%9B%99%E7%89%9B%E4%BB%94\" target=\"_blank\" data-content='{\"type\":\"nick\",\"nick\":\"青蛙牛仔\"}' title=\"青蛙牛仔\"><i class=\"at\">@</i>青蛙牛仔</a></b> 内容长又长 <a href=\"http://t.itc.cn/redKt\" target=\"_blank\" data-content='{\"type\":\"url\"}' title=\"http://blog.s135.com/category/19/\">http://t.itc.cn/redKt</a> <a href=\"http://t.itc.cn/rFLLE\" target=\"_blank\" data-content='{\"type\":\"url\"}' title=\"http://blog.s135.com/category/19/\">http://t.itc.cn/rFLLE</a> 出现一些干部娈童案,我转评了,有网友说,国外神职人员也有啊.是的,每一个群体中都有,但与恶劣比例最大的,全世界只有一个准神职组织,而这个组织可能由你我供养着,我们要使它纯洁,就得多批评,使其阳光公开,使其受到真正的监督.外国的媒体会报道外国神职人员,我们得有我们的责任与权利意识.<i class=\"x x40\" title=\"[给力]\"></i><i class=\"x x185\" title=\"[不懂]\"></i>";

    long start = System.currentTimeMillis();
    for (int i = 0; i < 100000; i++) {
        // Rewrite topics, then @ mentions, into the serialized form.
        con = delStatusByReg(con, regStatusTopic, replaceTopic);
        con = delStatusByReg(con, regStatusAT, replaceAT);
    }
    String ret = con;
    long end = System.currentTimeMillis();

    System.out.println("ret:" + ret);
    System.out.println("耗时:" + (end - start) + "毫秒");
}
}

}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值