读取xml时使用jsoup简单方便, 但是一旦xml过大, 使用jsoup就无法parse, 会出现java heap溢出
使用字符串的方式读取,可以按照标签选择读取范围
// Collects every <xxx>...</xxx> record of the file as one string per record.
List<String> xxxList = new ArrayList<String>();
StringBuilder sb = new StringBuilder();
// Closing the BufferedReader also closes the wrapped reader and file stream,
// so a single close() in the finally block is enough and also runs when
// reading fails part-way through.
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(
        new FileInputStream(xmlFile), "utf-8"));
try {
    String line;
    while ((line = bufferedReader.readLine()) != null) {
        sb.append(line).append('\n');
        // A line holding the closing tag ends the current record.
        if (line.contains("</xxx>")) {
            xxxList.add(sb.toString());
            sb.setLength(0);
        }
    }
} finally {
    bufferedReader.close();
}
for (String xxx : xxxList) {
    String tag = StringHelper.GetInnerText(xxx, "子标签名").trim();
}
每次内存只加载到<xxx></xxx>之间的内容, 节约内存
package CodingHelper;
import java.net.URLDecoder;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Hashtable;
import java.util.List;
import java.util.regex.*;
public class StringHelper {
// Character-class source for the delimiter patterns below (punctuation plus whitespace).
// NOTE(review): this literal and the next one appear to have been damaged in
// transit (a backslash before a typographic quote and an unescaped double
// quote in the middle) - confirm against the original source before relying on them.
public static String RegexDelimitter = "[。?!,、;:“\”‘’()─…—·《》〈〉﹄﹃﹂﹁〔〕【】~@#$%^&*_+{}|<>`=[]\\[\\]./ ̄¥-"'\\s~!@#$%^&*()_+{}|:\"<>?`=;',./–-]";
// Same character class as above without the whitespace class.
public static String RegexDelimitterNoSpace = "[。?!,、;:“\”‘’()─…—·《》〈〉﹄﹃﹂﹁〔〕【】~@#$%^&*_+{}|<>`=[]\\[\\]./ ̄¥-"'~!@#$%^&*()_+{}|:\"<>?`=;',./–-]";
// One or more whitespace characters.
public static Pattern RegSpace = Pattern.compile("\\s+");
// A listed preposition/conjunction surrounded by whitespace (case-insensitive).
public static Pattern RegPrep = Pattern.compile(
"\\s+(and|&|of|for|at|in|on|the|to|al|de|et|und)\\s+",
Pattern.CASE_INSENSITIVE);
// Chinese particles handled as prepositions.
public static Pattern RegPrep2 = Pattern.compile("与|及|的",
Pattern.CASE_INSENSITIVE);
// A listed preposition preceded by whitespace and directly followed by a letter.
// Because of CASE_INSENSITIVE, [A-Z] also matches lower-case letters.
public static Pattern RegPrep3 = Pattern.compile(
"\\s+(and|&|of|for|at|in|on|the|to|al|de|et|und)[A-Z]",
Pattern.CASE_INSENSITIVE);
// Matches a string that consists of exactly one listed preposition.
public static Pattern RegIsPrep = Pattern.compile(
"^(and|&|of|for|at|in|on|the|to|al|de|et|und)$",
Pattern.CASE_INSENSITIVE);
// An '&', then 2 to 6 arbitrary characters (reluctant), then ';'.
public static Pattern RegHtmlDecode = Pattern.compile("&.{2,6}?;",
Pattern.CASE_INSENSITIVE);
public static Pattern RegCtrlChars = Pattern.compile("[\\u0001-\\u001f]");// do not escape
public static Pattern RegComma = Pattern.compile("(,|;|-)");// do not escape
// Compiled forms of the two delimiter character classes above.
public static Pattern RegDelimit = Pattern.compile(RegexDelimitter);
public static Pattern RegDelimitNoSpace = Pattern
.compile(RegexDelimitterNoSpace);
/**
 * Encodes a 32-bit integer as four bytes, most significant byte first.
 *
 * @param number value to encode
 * @return a new 4-element array holding the big-endian bytes of {@code number}
 */
public static byte[] Int32ToByteArray(int number) {
    byte[] bytes = new byte[4];
    for (int i = 0; i < bytes.length; i++) {
        // Shift the wanted byte into the low 8 bits; the cast keeps only those bits.
        int shift = 8 * (bytes.length - 1 - i);
        bytes[i] = (byte) (number >> shift);
    }
    return bytes;
}
/**
 * Decodes four big-endian bytes into a 32-bit integer.
 *
 * @param buffer source bytes
 * @param offset index of the most significant byte
 * @return the integer stored in {@code buffer[offset .. offset + 3]}
 * @throws ArrayIndexOutOfBoundsException if fewer than four bytes are available
 */
public static int ByteArrayToInt32(byte[] buffer, int offset) {
    // Java bytes are signed: mask each one so a negative byte does not
    // sign-extend and overwrite the higher-order bits when OR-ed in.
    return ((buffer[offset] & 0xff) << 24)
            | ((buffer[offset + 1] & 0xff) << 16)
            | ((buffer[offset + 2] & 0xff) << 8)
            | (buffer[offset + 3] & 0xff);
}
/**
 * Normalizes a sentence into a lower-case key string.
 *
 * <p>Processing order: characters matched by {@code RegComma} become spaces,
 * prepositions are optionally removed, characters matched by
 * {@code RegDelimitNoSpace} are optionally removed, then the text is
 * shaped according to {@code sType}.
 *
 * <p>HTML entities are not decoded here. The previous call to
 * {@code HtmlDecode} discarded its result, so its output never took part in
 * the returned key.
 *
 * @param sText    sentence to normalize; must not be null
 * @param bNoPrep  whether to remove prepositions
 * @param bNoDelim whether to remove delimiter characters
 * @param sType    {@code null}, {@code ""} or {@code "simple"}: strip everything
 *                 matched by {@code RegDelimit}; {@code "wsort"}: sort the
 *                 space-separated words and concatenate them; any other value:
 *                 strip delimiters and sort the individual characters
 * @return the normalized, lower-cased text
 */
public static String SentenceToString(String sText, boolean bNoPrep,
        boolean bNoDelim, String sType) {
    sText = RegComma.matcher(sText).replaceAll(" ");
    if (bNoPrep) {
        // Strings are immutable, so the stripped text has to be taken from a
        // return value; RemovePreposition(String) only reports a boolean.
        sText = stripPrepositions(sText);
    }
    if (bNoDelim) {
        sText = RegDelimitNoSpace.matcher(sText).replaceAll("");
    }
    // equals() instead of ==: the mode must be compared by content.
    if (sType == null || sType.equals("") || sType.equals("simple")) {
        return RegDelimit.matcher(sText).replaceAll("").toLowerCase();
    }
    StringBuilder sContent = new StringBuilder(sText.length());
    if (sType.equals("wsort")) {
        List<String> lstText = new ArrayList<String>();
        for (String word : sText.split(" ")) {
            if (!word.isEmpty()) {
                lstText.add(word);
            }
        }
        // Sorting happens before lower-casing, as in the original flow.
        Collections.sort(lstText);
        for (String word : lstText) {
            sContent.append(word);
        }
        return sContent.toString().toLowerCase();
    }
    sText = RegDelimit.matcher(sText).replaceAll("");
    List<Character> lstChars = new ArrayList<Character>(sText.length());
    for (char c : sText.toCharArray()) {
        lstChars.add(c);
    }
    Collections.sort(lstChars);
    for (Character c : lstChars) {
        sContent.append(c.charValue());
    }
    return sContent.toString().toLowerCase();
}

/**
 * Returns the text with prepositions removed: whitespace-delimited entries
 * of RegPrep collapse to one space; if none occur, the particles of
 * RegPrep2 are deleted. RegPrep3 is deliberately not applied because, being
 * case-insensitive, it would also cut ordinary words that merely start with
 * a preposition.
 */
private static String stripPrepositions(String text) {
    Matcher western = RegPrep.matcher(text);
    if (western.find()) {
        return western.replaceAll(" ");
    }
    Matcher particles = RegPrep2.matcher(text);
    if (particles.find()) {
        return particles.replaceAll("");
    }
    return text;
}
/**
 * Reports whether the text contains a preposition matched by
 * {@code RegPrep}, {@code RegPrep2} or {@code RegPrep3}.
 *
 * <p>Java passes the {@code String} reference by value, so this method can
 * never hand a modified text back to its caller; only the boolean result is
 * observable. The earlier implementation rebuilt the text locally and, for
 * an input matched entirely by {@code RegPrep3}, kept prepending a space
 * forever. The local rewriting has therefore been dropped.
 *
 * @param sText text to inspect; must not be null
 * @return {@code true} if any of the three preposition patterns occurs
 */
public static boolean RemovePreposition(String sText) {
    return RegPrep.matcher(sText).find()
            || RegPrep2.matcher(sText).find()
            || RegPrep3.matcher(sText).find();
}
/**
 * Reports whether the text contains a character matched by
 * {@code RegCtrlChars}.
 *
 * <p>The caller's string is never changed: a stripped copy could only be
 * assigned to the local parameter, which the caller cannot see.
 *
 * @param sText text to inspect; must not be null
 * @return {@code true} if at least one control character was found
 */
public static boolean RemoveControlChars(String sText) {
    return RegCtrlChars.matcher(sText).find();
}
/**
 * Looks for an entity-like token ({@code RegHtmlDecode}) and, when one is
 * present, runs the text through {@link URLDecoder#decode(String)}.
 *
 * <p>NOTE(review): the decoded value is only stored in the local parameter,
 * so callers never receive it, and URL decoding is not HTML entity decoding.
 * Both need a signature change to fix; behaviour is kept as it was.
 *
 * @param sText text to inspect; must not be null
 */
public static void HtmlDecode(String sText) {
    if (!RegHtmlDecode.matcher(sText).find()) {
        return;
    }
    sText = URLDecoder.decode(sText);
}
/**
 * Parses a text value according to a type name.
 *
 * @param sType one of {@code "int"}, {@code "long"}, {@code "float"},
 *              {@code "double"}, {@code "char"} or {@code "String"}
 * @param sText text to convert
 * @return an {@code Integer}, {@code Long}, {@code Float}, {@code Double},
 *         a {@code char[]} (for {@code "char"}) or the text itself;
 *         {@code null} when the type name is not recognised
 * @throws NumberFormatException if a numeric type is requested and the text
 *                               is not a valid number
 */
public static Object ParseValue(String sType, String sText) {
    if (sType.equals("int")) {
        return Integer.parseInt(sText);
    }
    if (sType.equals("long")) {
        return Long.parseLong(sText);
    }
    if (sType.equals("float")) {
        return Float.parseFloat(sText);
    }
    if (sType.equals("double")) {
        return Double.parseDouble(sText);
    }
    if (sType.equals("char")) {
        // "char" yields the whole text as a char[] array.
        return sText.toCharArray();
    }
    if (sType.equals("String")) {
        return sText;
    }
    return null;
}
/**
 * Tells whether a string consists solely of the ASCII digits 0-9.
 *
 * <p>A plain character scan replaces the per-call regex compilation. It also
 * rejects input with a trailing line break, which the former
 * {@code ^\d+$} search accepted.
 *
 * @param line text to test; may be null
 * @return {@code true} for a non-empty all-digit string, otherwise {@code false}
 */
public static boolean IsNumberString(String line) {
    if (line == null || line.isEmpty()) {
        return false;
    }
    for (int i = 0; i < line.length(); i++) {
        char c = line.charAt(i);
        if (c < '0' || c > '9') {
            return false;
        }
    }
    return true;
}
/**
 * Converts a hexadecimal string, with or without a leading "0x", to an int.
 *
 * <p>No validation is performed: any character outside 0-9 is treated as a
 * letter digit relative to 'A', and overflow silently wraps.
 *
 * @param line hexadecimal text in either case; must not be null
 * @return the accumulated value, 0 for an empty string
 */
static public int HexStr2Int(String line) {
    String digits = line.toUpperCase();
    if (digits.startsWith("0X")) {
        digits = digits.substring(2);
    }
    int result = 0;
    for (char c : digits.toCharArray()) {
        int nibble = (c >= '0' && c <= '9') ? (c - '0') : (c - 'A' + 10);
        result = (result << 4) + nibble;
    }
    return result;
}
/**
 * Converts a byte array to a lower-case hexadecimal string without zero
 * padding: a byte below 0x10 contributes a single digit.
 *
 * @param array bytes to convert; must not be null
 * @return the concatenated hexadecimal digits
 */
static public String ToHexString(byte[] array) {
    StringBuilder hex = new StringBuilder(1024);
    for (byte b : array) {
        // Mask to 0..255 so negative bytes do not print as 8-digit values.
        hex.append(Integer.toHexString(b & 0xFF));
    }
    return hex.toString();
}
/**
 * Converts a byte array to a lower-case hexadecimal string using exactly
 * two digits per byte.
 *
 * @param array bytes to convert; must not be null
 * @return the zero-padded hexadecimal representation
 */
static public String ToHexString2(byte[] array) {
    StringBuilder hex = new StringBuilder(1024);
    for (byte b : array) {
        int unsigned = b & 0xFF;
        if (unsigned < 0x10) {
            hex.append('0');
        }
        hex.append(Integer.toHexString(unsigned));
    }
    return hex.toString();
}
/**
 * Parses a string of two-digit hexadecimal pairs into bytes.
 *
 * <p>A trailing unpaired digit is ignored.
 *
 * @param sText hexadecimal text such as {@code "0aff01"}; must not be null
 * @return one byte per digit pair
 * @throws NumberFormatException if a pair is not valid hexadecimal
 */
public static byte[] FromHexString(String sText) {
    int len = sText.length() / 2;
    byte[] buffer = new byte[len];
    for (int i = 0; i < len; i++) {
        // Java's substring takes an end index (not a length), and the digits
        // must be parsed with radix 16.
        String pair = sText.substring(2 * i, 2 * i + 2);
        buffer[i] = (byte) Integer.parseInt(pair, 16);
    }
    return buffer;
}
/**
 * Removes every occurrence of each given delimiter from the text.
 *
 * <p>Delimiters are literal strings and are removed cumulatively in array
 * order. An empty array leaves the text unchanged; {@code null} or empty
 * entries are skipped.
 *
 * @param sText  text to clean; must not be null
 * @param sArray delimiters to remove; must not be null
 * @return the text without the delimiters
 */
public static String RemoveDelimiter(String sText, String[] sArray) {
    String result = sText;
    for (String delimiter : sArray) {
        if (delimiter == null || delimiter.isEmpty()) {
            continue;
        }
        // Continue from the previous result so every delimiter takes effect.
        result = result.replace(delimiter, "");
    }
    return result;
}
/**
 * Removes every match of a regular expression from the text.
 *
 * @param sText     text to clean; must not be null
 * @param sRepPunct regular expression to remove; {@code null}, empty or equal
 *                  to {@code RegexDelimitter} selects the precompiled
 *                  {@code RegDelimit} pattern
 * @return the text with all matches removed
 */
public static String RemoveDelimiter(String sText, String sRepPunct) {
    // Compare by content, not identity, and reuse the compiled default
    // pattern instead of recompiling it from its string form.
    if (sRepPunct == null || sRepPunct.equals("")
            || sRepPunct.equals(RegexDelimitter)) {
        return RegDelimit.matcher(sText).replaceAll("");
    }
    return Pattern.compile(sRepPunct).matcher(sText).replaceAll("");
}
/**
 * Replaces every match of a regular expression in the text.
 *
 * @param sText    text to process; must not be null
 * @param sRegex   regular expression to search for
 * @param sReplace replacement string (regex replacement syntax applies)
 * @return the text after replacement; equal to the input when nothing matches
 */
public static String ReplaceContent(String sText, String sRegex,
        String sReplace) {
    Matcher matcher = Pattern.compile(sRegex).matcher(sText);
    return matcher.find() ? matcher.replaceAll(sReplace) : sText;
}
/**
 * Appends an {@code attr=value} pair to a GET/POST parameter string.
 *
 * <p>Java passes the {@code String} reference by value, so the previous
 * {@code void} version could never deliver its result. The combined string
 * is now returned; existing statement-style calls still compile and simply
 * ignore it.
 *
 * @param sText  current URL or parameter string; may be null or empty
 * @param sAttr  attribute name
 * @param sValue attribute value
 * @return the pair itself when {@code sText} is null or empty; the text plus
 *         the pair when the text ends with {@code '?'}; otherwise the text,
 *         an {@code '&'} and the pair
 */
public static String AddValue(String sText, String sAttr, String sValue) {
    String pair = sAttr + "=" + sValue;
    if (sText == null || sText.equals("")) {
        return pair;
    }
    int query = sText.indexOf('?');
    // Nothing after the '?' yet: the pair becomes the first parameter.
    if (query > 0 && query == sText.length() - 1) {
        return sText + pair;
    }
    return sText + "&" + pair;
}
/**
 * Locates {@code sAttr} in a GET/POST parameter string and builds the text
 * with its value replaced, or adds the pair when the attribute is absent.
 *
 * <p>NOTE(review): the updated text is only assigned to the local parameter,
 * so callers never receive it; returning it requires a signature change.
 * This version only corrects the substring bounds (Java takes an end index,
 * not a length), which could previously throw
 * {@code StringIndexOutOfBoundsException}.
 *
 * @param sText  URL or parameter string; must not be null
 * @param sAttr  attribute name
 * @param sValue new attribute value
 * @return always {@code true}
 */
public static boolean SetValue(String sText, String sAttr, String sValue) {
    String lead = "&";
    String key = lead + sAttr + "=";
    int pos1 = sText.indexOf(key);
    if (pos1 < 0) {
        // Not found after '&': the attribute may be the first one after '?'.
        lead = "?";
        key = lead + sAttr + "=";
        pos1 = sText.indexOf(key);
        if (pos1 < 0) {
            AddValue(sText, sAttr, sValue);
            return true;
        }
    }
    int pos2 = sText.indexOf("&", pos1 + key.length());
    String current = (pos2 < 0) ? sText.substring(pos1)
            : sText.substring(pos1, pos2);
    sText = sText.replace(current, lead + sAttr + "=" + sValue);
    return true;
}
/**
 * Checks whether a GET/POST parameter string contains {@code sAttr=}.
 *
 * <p>NOTE(review): the extracted value is only assigned to the local
 * {@code sValue} parameter, so callers receive the boolean result only.
 * This version corrects the substring bounds (Java takes an end index, not a
 * length), which could previously throw
 * {@code StringIndexOutOfBoundsException}.
 *
 * @param sText  URL or parameter string; must not be null
 * @param sAttr  attribute name to look for
 * @param sValue unused on return (see note)
 * @return {@code true} if {@code "&" + sAttr + "="} or {@code sAttr + "="}
 *         occurs in the text
 */
public static boolean ParseValue(String sText, String sAttr, String sValue) {
    String key = "&" + sAttr + "=";
    int pos1 = sText.indexOf(key);
    if (pos1 < 0) {
        key = sAttr + "=";
        pos1 = sText.indexOf(key);
        if (pos1 < 0) {
            return false;
        }
    }
    pos1 += key.length();
    int pos2 = sText.indexOf('&', pos1);
    sValue = (pos2 < 0) ? sText.substring(pos1) : sText.substring(pos1, pos2);
    return true;
}
/**
 * Extracts the value of an attribute from a tag-like line.
 *
 * <p>The attribute is searched in three forms, in this order:
 * {@code attr='value'}, {@code attr="value"} and unquoted {@code attr=value}.
 * A quoted value ends at its closing quote; an unquoted value (or a quoted
 * one with no closing quote) ends at the next space or {@code '>'},
 * whichever comes first. Occurrences of {@code " = "} are treated as
 * {@code "="}.
 *
 * <p>Fixes over the earlier version: the search keys are built by plain
 * concatenation (a {@code MessageFormat} pattern ending in a single quote
 * silently drops that quote), the value is cut with an end index rather
 * than a length, the unquoted form no longer looks for a double quote, and
 * {@code sLink} is fully cleared before use.
 *
 * @param line  text containing the attribute; must not be null
 * @param sAttr attribute name
 * @param sLink receives the value; emptied first, left empty on failure
 * @return {@code true} if the attribute and the end of its value were found
 */
public static boolean ParseAttribute(String line, String sAttr,
        StringBuffer sLink) {
    sLink.setLength(0);
    line = line.replace(" = ", "=");

    char quote = '\'';
    String key = sAttr + "='";
    int pos1 = line.indexOf(key);
    if (pos1 < 0) {
        quote = '"';
        key = sAttr + "=\"";
        pos1 = line.indexOf(key);
        if (pos1 < 0) {
            quote = 0; // unquoted value
            key = sAttr + "=";
            pos1 = line.indexOf(key);
            if (pos1 < 0) {
                return false;
            }
        }
    }
    pos1 += key.length();

    int pos2 = (quote != 0) ? line.indexOf(quote, pos1) : -1;
    if (pos2 < 0) {
        int space = line.indexOf(' ', pos1);
        int close = line.indexOf('>', pos1);
        if (space < 0) {
            pos2 = close;
        } else if (close >= 0 && close < space) {
            pos2 = close;
        } else {
            pos2 = space;
        }
        if (pos2 < 0) {
            return false;
        }
    }
    sLink.append(line, pos1, pos2);
    return true;
}
/**
 * Converts the leading part of a string to an integer.
 *
 * <p>The whole string is tried first. If that fails, the trimmed string is
 * read up to the first character that is neither a digit nor a comma, commas
 * are dropped (thousands separators) and the rest is parsed.
 *
 * @param sText text to convert; may be null
 * @return the parsed value; {@code -1} when the text is null or does not
 *         start with a digit or comma; {@code 0} when the digit prefix
 *         cannot be parsed (for example on overflow)
 */
static public int ToInteger(String sText) {
    if (sText == null) {
        return -1;
    }
    try {
        return Integer.parseInt(sText);
    } catch (NumberFormatException ignored) {
        // Not a plain integer: fall through to the lenient prefix parse.
    }
    sText = sText.trim();
    int index = 0;
    while (index < sText.length()) {
        char c = sText.charAt(index);
        if (c != ',' && (c < '0' || c > '9')) {
            break;
        }
        index++;
    }
    if (index == 0) {
        return -1;
    }
    // Remove commas wherever they occur, including at position 0.
    String digits = sText.substring(0, index).replace(",", "");
    try {
        return Integer.parseInt(digits);
    } catch (NumberFormatException ignored) {
        // Prefix was only commas or overflowed: keep the historical result 0.
        return 0;
    }
}
/**
 * Extracts the text of the first XML element with the given tag name.
 *
 * <p>The plain form {@code <tag>} is tried first; if that yields nothing,
 * the form with attributes ({@code <tag ...>}) is tried.
 *
 * @param content XML text to search; must not be null
 * @param tagname element name
 * @return the trimmed element text, or {@code ""} if the element is missing
 */
public static String GetInnerText(String content, String tagname) {
    String end = "</" + tagname + ">";
    String str = GetInnerText(content, 0, "<" + tagname + ">", end);
    // isEmpty() instead of != "": an empty result must be detected by content.
    if (str != null && !str.isEmpty()) {
        return str;
    }
    return GetInnerText(content, 0, "<" + tagname + " ", end);
}
/**
 * Extracts the text of the first XML element with the given tag name found
 * at or after {@code start}.
 *
 * <p>The plain form {@code <tag>} is tried first; if that yields nothing,
 * the form with attributes ({@code <tag ...>}) is tried.
 *
 * @param sText    XML text to search; must not be null
 * @param start    index at which the search begins
 * @param sTagName element name
 * @return the trimmed element text, or {@code ""} if the element is missing
 */
public static String GetInnerText(String sText, int start, String sTagName) {
    String sEnd = "</" + sTagName + ">";
    String str = GetInnerText(sText, start, "<" + sTagName + ">", sEnd);
    // isEmpty() instead of != "": an empty result must be detected by content.
    if (str != null && !str.isEmpty()) {
        return str;
    }
    return GetInnerText(sText, start, "<" + sTagName + " ", sEnd);
}
/**
 * Returns the text between the first occurrence of sBeg and the next
 * occurrence of sEnd, searching from the start of sText. Delegates to the
 * overload that takes a start index, passing 0.
 *
 * @param sText text to search
 * @param sBeg  text that precedes the wanted content
 * @param sEnd  text that follows the wanted content
 * @return the result of the delegated call
 */
public static String GetInnerText(String sText, String sBeg, String sEnd) {
return GetInnerText(sText, 0, sBeg, sEnd);
}
/**
 * Returns the trimmed text between {@code sBeg} and the following
 * {@code sEnd}.
 *
 * <p>If {@code sBeg} does not end with {@code '>'} (an opening tag given
 * without its attributes), everything up to and including the first
 * {@code '>'} of the extracted text is dropped. A surrounding
 * {@code <![CDATA[ ... ]]>} wrapper is removed.
 *
 * @param sText text to search; must not be null
 * @param start index at which the search for {@code sBeg} begins
 * @param sBeg  text that precedes the wanted content
 * @param sEnd  text that follows the wanted content
 * @return the extracted text, or {@code ""} if either marker is missing
 */
public static String GetInnerText(String sText, int start, String sBeg,
        String sEnd) {
    int pos1 = sText.indexOf(sBeg, start);
    if (pos1 < 0) {
        return "";
    }
    pos1 += sBeg.length();
    int pos2 = sText.indexOf(sEnd, pos1);
    if (pos2 < 0) {
        return "";
    }
    String str = sText.substring(pos1, pos2);
    if (!sBeg.endsWith(">")) {
        int tagClose = str.indexOf('>');
        if (tagClose >= 0) {
            str = str.substring(tagClose + 1);
        }
    }
    str = str.trim();
    if (str.startsWith("<![CDATA[") && str.endsWith("]]>")) {
        // Drop the 9-character prefix and the 3-character suffix; the second
        // argument of substring is an end index, not a length.
        str = str.substring(9, str.length() - 3);
    }
    return str;
}
/**
 * Extracts the trimmed text between {@code sBeg} and the following
 * {@code sEnd} into {@code sResult} and reports where to continue.
 *
 * <p>If {@code sBeg} does not end with {@code '>'}, everything up to and
 * including the first {@code '>'} of the extracted text is dropped. A
 * surrounding {@code <![CDATA[ ... ]]>} wrapper is removed.
 *
 * <p>Fixes over the earlier version: the text is cut with an end index
 * rather than a length, the CDATA test uses {@code startsWith} for the
 * prefix and strips exactly the wrapper, and {@code sResult} is replaced in
 * full.
 *
 * @param sText   text to search; must not be null
 * @param start   index at which the search for {@code sBeg} begins
 * @param sBeg    text that precedes the wanted content
 * @param sEnd    text that follows the wanted content
 * @param sResult receives the extracted text; untouched on failure
 * @return the index just past {@code sEnd}, or {@code -1} if either marker
 *         is missing
 */
public static int GetInnerText(String sText, int start, String sBeg,
        String sEnd, StringBuffer sResult) {
    int pos1 = sText.indexOf(sBeg, start);
    if (pos1 < 0) {
        return -1;
    }
    pos1 += sBeg.length();
    int pos2 = sText.indexOf(sEnd, pos1);
    if (pos2 < 0) {
        return -1;
    }
    String str = sText.substring(pos1, pos2);
    if (!sBeg.endsWith(">")) {
        int tagClose = str.indexOf('>');
        if (tagClose >= 0) {
            str = str.substring(tagClose + 1);
        }
    }
    str = str.trim();
    if (str.startsWith("<![CDATA[") && str.endsWith("]]>")) {
        str = str.substring(9, str.length() - 3);
    }
    sResult.setLength(0);
    sResult.append(str);
    return pos2 + sEnd.length();
}
/**
 * Extracts the text inside a tag. Delegates to GetInnerHTML with a repeat
 * count of 1, passing bLast in the position of that overload's bReverse
 * argument.
 *
 * @param sText  text to search
 * @param start  start position
 * @param bLast  whether the last tag is wanted
 * @param sTag   tag name
 * @param sValue receives the extracted text
 * @return the end position on success, -1 on failure
 */
public static int ParseFieldValue(String sText, int start, boolean bLast,
String sTag, StringBuffer sValue) {
return GetInnerHTML(sText, start, 1, bLast, sTag, sValue);
}
/**
 * Extracts the text inside the first occurrence of a tag. Delegates to the
 * six-argument overload with a repeat count of 1 and bReverse set to false.
 *
 * @param sText  text to search
 * @param start  start position
 * @param sTag   tag name
 * @param sValue receives the extracted text
 * @return the end position on success, -1 on failure
 */
public static int GetInnerHTML(String sText, int start, String sTag,
StringBuffer sValue) {
return GetInnerHTML(sText, start, 1, false, sTag, sValue);
}
/**
 * Extracts the HTML inside the iRepTimes-th occurrence of a tag. Delegates
 * to the six-argument overload with bReverse set to false.
 *
 * @param sText     text to search
 * @param start     start position
 * @param iRepTimes which occurrence of the tag to use
 * @param sTag      HTML tag such as A, TABLE or P
 * @param sResult   receives the extracted text
 * @return the result of the delegated call
 */
public static int GetInnerHTML(String sText, int start, int iRepTimes,
String sTag, StringBuffer sResult) {
return GetInnerHTML(sText, start, iRepTimes, false, sTag, sResult);
}
/**
 * Extracts the HTML inside a tag, identified by tag name.
 *
 * <p>Builds the markers {@code "<" + sTag} and {@code "</" + sTag + ">"} and
 * delegates to the marker-based overload.
 *
 * @param sText     text to search
 * @param start     position at which the search begins
 * @param iRepTimes which occurrence of the tag to use
 * @param bReverse  whether to search backwards
 * @param sTag      tag name
 * @param sResult   receives the extracted text
 * @return the result of the delegated call
 */
public static int GetInnerHTML(String sText, int start, int iRepTimes,
        boolean bReverse, String sTag, StringBuffer sResult) {
    return GetInnerHTML(sText, start, iRepTimes, bReverse,
            "<" + sTag, "</" + sTag + ">", sResult);
}
/**
 * Extracts the trimmed text between the {@code iRepTimes}-th occurrence of
 * {@code sBeg} and the following {@code sEnd}.
 *
 * <p>If {@code sBeg} does not end with {@code '>'}, everything up to and
 * including the first {@code '>'} of the extracted text is dropped (the
 * remainder of the opening tag).
 *
 * <p>Fixes over the earlier version: the text is cut with an end index
 * rather than a length, and {@code sResult} is replaced with the final text
 * instead of being appended to and only partially cleared.
 *
 * @param sText     text to search; must not be null
 * @param start     position at which a forward search begins; a hit before
 *                  this position is rejected in either direction
 * @param iRepTimes which occurrence of {@code sBeg} to use (values below 2
 *                  select the first one found)
 * @param bReverse  {@code true} to count occurrences from the end of the text
 * @param sBeg      opening marker
 * @param sEnd      closing marker
 * @param sResult   receives the extracted text; untouched on failure
 * @return the index just past {@code sEnd}, or {@code -1} on failure
 */
public static int GetInnerHTML(String sText, int start, int iRepTimes,
        boolean bReverse, String sBeg, String sEnd, StringBuffer sResult) {
    int pos1;
    if (!bReverse) {
        pos1 = sText.indexOf(sBeg, start);
        for (int n = iRepTimes; pos1 >= 0 && n > 1; n--) {
            pos1 = sText.indexOf(sBeg, pos1 + sBeg.length());
        }
    } else {
        pos1 = sText.lastIndexOf(sBeg);
        for (int n = iRepTimes; pos1 >= 0 && n > 1; n--) {
            // Each step back confines the search (and the later end-marker
            // lookup) to the text before the previous hit, as before.
            sText = sText.substring(0, pos1);
            pos1 = sText.lastIndexOf(sBeg);
        }
    }
    if (pos1 < start) {
        return -1;
    }
    pos1 += sBeg.length();
    int pos2 = sText.indexOf(sEnd, pos1);
    if (pos2 < 0) {
        return -1;
    }
    String inner = sText.substring(pos1, pos2);
    if (!sBeg.endsWith(">")) {
        int tagClose = inner.indexOf('>');
        if (tagClose >= 0) {
            inner = inner.substring(tagClose + 1);
        }
    }
    sResult.setLength(0);
    sResult.append(inner.trim());
    return pos2 + sEnd.length();
}
/**
 * Gets the outer HTML of the first occurrence of a tag.
 *
 * <p>The closing tag {@code </tag>} is tried as the end marker first; when
 * that call does not return a positive position, a bare {@code ">"} is used
 * instead.
 *
 * @param sText   text to search
 * @param start   start position
 * @param sTag    tag name
 * @param sResult receives the extracted markup
 * @return the value returned by the marker-based overload
 */
public static int GetOuterHTML(String sText, int start, String sTag,
        StringBuffer sResult) {
    String opening = "<" + sTag;
    int end = GetOuterHTML(sText, start, 1, false, opening,
            "</" + sTag + ">", sResult);
    return (end > 0) ? end
            : GetOuterHTML(sText, start, 1, false, opening, ">", sResult);
}
/**
 * Gets the outer HTML of the {@code iRepTimes}-th occurrence of a tag.
 *
 * <p>The closing tag {@code </tag>} is tried as the end marker first; when
 * that call does not return a positive position, a bare {@code ">"} is used
 * instead.
 *
 * @param sText     text to search
 * @param start     start position
 * @param iRepTimes which occurrence of the tag to use
 * @param sTag      tag name
 * @param sResult   receives the extracted markup
 * @return the value returned by the marker-based overload
 */
public static int GetOuterHTML(String sText, int start, int iRepTimes,
        String sTag, StringBuffer sResult) {
    String opening = "<" + sTag;
    int end = GetOuterHTML(sText, start, iRepTimes, false, opening,
            "</" + sTag + ">", sResult);
    return (end > 0) ? end
            : GetOuterHTML(sText, start, iRepTimes, false, opening, ">",
                    sResult);
}
/**
 * Gets the outer HTML of the {@code iRepTimes}-th occurrence of a tag,
 * optionally counting from the end of the text.
 *
 * <p>The closing tag {@code </tag>} is tried as the end marker first; when
 * that call does not return a positive position, a bare {@code ">"} is used
 * instead. {@code bReverse} is now forwarded; it used to be ignored and
 * {@code false} was always passed.
 *
 * @param sText     text to search
 * @param start     start position
 * @param iRepTimes which occurrence of the tag to use
 * @param bReverse  whether to search backwards
 * @param sTag      tag name
 * @param sResult   receives the extracted markup
 * @return the value returned by the marker-based overload
 */
public static int GetOuterHTML(String sText, int start, int iRepTimes,
        boolean bReverse, String sTag, StringBuffer sResult) {
    String sBeg = "<" + sTag;
    String sEnd = "</" + sTag + ">";
    int pos1 = GetOuterHTML(sText, start, iRepTimes, bReverse, sBeg, sEnd,
            sResult);
    if (pos1 > 0) {
        return pos1;
    }
    sEnd = ">";
    return GetOuterHTML(sText, start, iRepTimes, bReverse, sBeg, sEnd,
            sResult);
}
/**
 * Extracts the text from the {@code iRepTimes}-th occurrence of
 * {@code sBeg} through the following {@code sEnd}, both markers included.
 *
 * <p>{@code sResult} is now replaced in full; the earlier
 * {@code replace(0, length - 1, ...)} left the last stale character behind.
 *
 * @param sText     text to search; must not be null
 * @param start     position at which a forward search begins; a hit before
 *                  this position is rejected in either direction
 * @param iRepTimes which occurrence of {@code sBeg} to use (values below 2
 *                  select the first one found)
 * @param bReverse  {@code true} to count occurrences from the end of the text
 * @param sBeg      opening marker
 * @param sEnd      closing marker
 * @param sResult   receives the extracted markup; untouched on failure
 * @return the index just past {@code sEnd}, or {@code -1} on failure
 */
public static int GetOuterHTML(String sText, int start, int iRepTimes,
        boolean bReverse, String sBeg, String sEnd, StringBuffer sResult) {
    int pos1;
    if (!bReverse) {
        pos1 = sText.indexOf(sBeg, start);
        for (int n = iRepTimes; pos1 >= 0 && n > 1; n--) {
            pos1 = sText.indexOf(sBeg, pos1 + sBeg.length());
        }
    } else {
        pos1 = sText.lastIndexOf(sBeg);
        for (int n = iRepTimes; pos1 >= 0 && n > 1; n--) {
            // Each step back confines the search (and the later end-marker
            // lookup) to the text before the previous hit, as before.
            sText = sText.substring(0, pos1);
            pos1 = sText.lastIndexOf(sBeg);
        }
    }
    if (pos1 < start) {
        return -1;
    }
    int pos2 = sText.indexOf(sEnd, pos1 + sBeg.length());
    if (pos2 < 0) {
        return -1;
    }
    pos2 += sEnd.length();
    sResult.setLength(0);
    sResult.append(sText, pos1, pos2);
    return pos2;
}
/**
 * Gets the {@code content} attribute of an HTML meta node.
 *
 * <p>The node is located through {@code ExtractContentByAttr} using the tag
 * {@code "meta"}, the attribute {@code "name"} and the value
 * {@code sTitle}; the {@code content} attribute is then read from the
 * located text with {@code ParseAttribute}.
 *
 * @param sText   HTML text to search
 * @param start   position at which the search begins
 * @param sTitle  node name to look for
 * @param sResult receives the attribute value
 * @return the position reported by {@code ExtractContentByAttr}, or
 *         {@code -1} if either step fails
 */
public static int GetMetaContent(String sText, int start, String sTitle,
        StringBuffer sResult) {
    StringBuffer metaTag = new StringBuffer();
    int end = StringHelper.ExtractContentByAttr(sText, start, 0, "meta",
            "name", sTitle, metaTag);
    boolean found = end >= 0
            && StringHelper.ParseAttribute(metaTag.toString(), "content",
                    sResult);
    return found ? end : -1;
}
/**
 * Finds the text inside a tag, identified by tag name.
 *
 * <p>Builds the markers {@code "<" + sTag} and {@code "</" + sTag + ">"} and
 * delegates to the marker-based overload.
 *
 * @param sText   text to search
 * @param start   position at which the search begins
 * @param sTag    page tag such as A, P or TABLE
 * @param sResult receives the extracted text
 * @return the position after the found content, {@code -1} on failure
 */
public static int FindString(String sText, int start, String sTag,
        StringBuffer sResult) {
    return FindString(sText, start, "<" + sTag, "</" + sTag + ">", sResult);
}
/**
 * Finds the text inside a tag, repeating the search {@code iRepTimes} times.
 *
 * <p>Builds the markers {@code "<" + sTag} and {@code "</" + sTag + ">"} and
 * delegates to the repeating marker-based overload.
 *
 * @param sText     text to search
 * @param start     position at which the search begins
 * @param iRepTimes number of repetitions
 * @param sTag      tag name
 * @param sResult   receives the extracted text
 * @return the result of the delegated call
 */
public static int FindString(String sText, int start, int iRepTimes,
        String sTag, StringBuffer sResult) {
    return FindString(sText, start, iRepTimes, "<" + sTag,
            "</" + sTag + ">", sResult);
}
/**
 * Finds the text between {@code sBegin} and {@code sEnd}.
 *
 * <p>With a null or empty {@code sEnd}, everything after {@code sBegin} is
 * taken. Whatever precedes the first {@code '>'} of the extracted text is
 * dropped (the remainder of an opening tag) and the result is trimmed.
 *
 * <p>Fixes over the earlier version: the text is cut with an end index
 * rather than a length, and {@code sResult} is replaced via
 * {@code setLength(0)}; the former {@code replace(0, length - 1, ...)} threw
 * on an empty buffer and otherwise left a stale character.
 *
 * @param sText   text to search; must not be null
 * @param start   position at which the search begins
 * @param sBegin  text that precedes the wanted content
 * @param sEnd    text that follows the wanted content; may be null or empty
 * @param sResult receives the extracted text; untouched on failure
 * @return the index just past {@code sEnd} (or just past {@code sBegin}
 *         when {@code sEnd} is null or empty), {@code -1} on failure
 */
public static int FindString(String sText, int start, String sBegin,
        String sEnd, StringBuffer sResult) {
    int pos1 = sText.indexOf(sBegin, start);
    if (pos1 < 0) {
        return -1;
    }
    pos1 += sBegin.length();
    int next = pos1;
    String found;
    if (sEnd == null || sEnd.equals("")) {
        found = sText.substring(pos1);
    } else {
        int pos2 = sText.indexOf(sEnd, pos1);
        if (pos2 < 0) {
            return -1;
        }
        found = sText.substring(pos1, pos2);
        next = pos2 + sEnd.length();
    }
    int tagClose = found.indexOf('>');
    if (tagClose >= 0) {
        found = found.substring(tagClose + 1);
    }
    sResult.setLength(0);
    sResult.append(found.trim());
    return next;
}
/**
 * Finds the text between {@code sBegin} and {@code sEnd}, repeating the
 * search so that the last successful hit ends up in {@code sResult}.
 *
 * <p>The search always runs at least once. Each repetition continues from
 * the position returned by the previous one and stops early on failure.
 *
 * @param sText     text to search
 * @param start     position at which the first search begins
 * @param iRepTimes number of repetitions
 * @param sBegin    text that precedes the wanted content
 * @param sEnd      text that follows the wanted content
 * @param sResult   receives the extracted text
 * @return the position after the last hit, or {@code -1} if a search failed
 */
public static int FindString(String sText, int start, int iRepTimes,
        String sBegin, String sEnd, StringBuffer sResult) {
    int pos = FindString(sText, start, sBegin, sEnd, sResult);
    int remaining = iRepTimes - 1;
    while (pos >= 0 && remaining > 0) {
        pos = FindString(sText, pos, sBegin, sEnd, sResult);
        remaining--;
    }
    return pos;
}
/**
 * Trims and lower-cases the text, then upper-cases the first letter of each
 * word.
 *
 * <p>A word starts at the beginning of the text and after any run of
 * separators: whitespace, Unicode space characters, characters of type
 * {@code OTHER_PUNCTUATION}, {@code ':'} and {@code '-'}. The earlier
 * version cleared its "previous was a separator" flag on the second of two
 * consecutive separators, so the word after {@code ", "} stayed lower-case.
 *
 * @param sText text to convert; must not be null
 * @return the converted text
 */
public static String FirstLetterCapital(String sText) {
    char[] chars = sText.trim().toLowerCase().toCharArray();
    boolean atWordStart = true;
    for (int i = 0; i < chars.length; i++) {
        char c = chars[i];
        if (isWordSeparator(c)) {
            atWordStart = true;
            continue;
        }
        if (atWordStart && Character.isLetter(c)) {
            chars[i] = Character.toUpperCase(c);
        }
        // Any non-separator (letter, digit, bracket...) ends the word start.
        atWordStart = false;
    }
    return new String(chars);
}

/** Tells whether the character separates words for FirstLetterCapital. */
private static boolean isWordSeparator(char c) {
    return Character.isWhitespace(c)
            || Character.isSpaceChar(c)
            || Character.getType(c) == Character.OTHER_PUNCTUATION
            || c == ':'
            || c == '-';
}
/**
 * Tests whether any part of {@code sText} relates to {@code sValue}.
 *
 * <p>{@code sText} is split with {@link String#split(String)}, so
 * {@code splitstr} is interpreted as a regular expression; empty parts are
 * ignored.
 *
 * <p>Fixes over the earlier version: parts are no longer copied into a null
 * array (which threw {@code NullPointerException}), equality is tested by
 * content, and a match at index 0 counts as containment.
 *
 * @param sText    text to split and test; must not be null
 * @param bToLower whether both sides are lower-cased before comparing
 * @param iCompare {@code > 0}: a part contains {@code sValue};
 *                 {@code 0}: a part equals {@code sValue};
 *                 {@code < 0}: {@code sValue} contains a part
 * @param sValue   value to compare against; must not be null
 * @param splitstr regular expression used to split {@code sText}
 * @return {@code true} if any part satisfies the selected relation
 */
public static boolean Contains(String sText, boolean bToLower,
        int iCompare, String sValue, String splitstr) {
    String line = bToLower ? sValue.toLowerCase() : sValue;
    for (String part : sText.split(splitstr)) {
        if (part.isEmpty()) {
            continue;
        }
        String str = bToLower ? part.toLowerCase() : part;
        boolean hit;
        if (iCompare == 0) {
            hit = line.equals(str);
        } else if (iCompare > 0) {
            hit = str.indexOf(line) >= 0;
        } else {
            hit = line.indexOf(str) >= 0;
        }
        if (hit) {
            return true;
        }
    }
    return false;
}
// / <summary>
// / 获取字符串首字母
// / </summary>
// / <param name="str"></param>
// / <returns></returns>
/*
* public static char GetFirstLetter(String text) { if (text == null ||
* text.equals("")) { return '\0'; } text = text.toLowerCase(); if
* (text.startsWith("the ")) { text = text.substring(4).trim(); }
*
* byte[] buffer =
* System.Text.Encoding.GetEncoding("GB2312").GetBytes(text.toCharArray(0,
* 1));
*
* int tmp = 0; if (buffer.length == 1) { tmp = buffer[0]; } else { tmp =
* buffer[1] | (buffer[0] << 8); }
*
* if (tmp >= 45217 && tmp <= 45252) { return 'A'; } if (tmp >= 45253 && tmp
* <= 45760) { return 'B'; } if (tmp >= 45761 && tmp <= 46317) { return 'C';
* } if (tmp >= 46318 && tmp <= 46825) { return 'D'; } if (tmp >= 46826 &&
* tmp <= 47009) { return 'E'; } if (tmp >= 47010 && tmp <= 47296) { return
* 'F'; } if (tmp >= 47297 && tmp <= 47613) { return 'G'; } if (tmp >= 47614
* && tmp <= 48118) { return 'H'; } if (tmp >= 48119 && tmp <= 49061) {
* return 'J'; } if (tmp >= 49062 && tmp <= 49323) { return 'K'; } if (tmp
* >= 49324 && tmp <= 49895) { return 'L'; } if (tmp >= 49896 && tmp <=
* 50370) { return 'M'; } if (tmp >= 50371 && tmp <= 50613) { return 'N'; }
* if (tmp >= 50614 && tmp <= 50621) { return 'O'; } if (tmp >= 50622 && tmp
* <= 50905) { return 'P'; } if (tmp >= 50906 && tmp <= 51386) { return 'Q';
* } if (tmp >= 51387 && tmp <= 51445) { return 'R'; } if (tmp >= 51446 &&
* tmp <= 52217) { return 'S'; } if (tmp >= 52218 && tmp <= 52697) { return
* 'T'; } if (tmp >= 52698 && tmp <= 52979) { return 'W'; } if (tmp >= 52980
* && tmp <= 53640) { return 'X'; } if (tmp >= 53689 && tmp <= 54480) {
* return 'Y'; } if (tmp >= 54481 && tmp <= 62289) { return 'Z'; }
*
* char first = text.charAt(0);
*
* if (first >= 'a' && first <= 'z') { first = (char)(first - ' '); } return
* first; }
*/
// / <summary>
// / 获取最大iMaxWord个单词或字的首字母,形成格式如中国人民-ZGRM,cell research-CR
// / </summary>
// / <param name="sText"></param>
// / <param name="iMaxWord"></param>
// / <returns></returns>
/*
* public static String GetInitialLetters(String sText, int iMaxWord) {
* String line = "", sResult = ""; if (!CultureHelper.IsForeign(sText)) {
* for (int i = 0; i < sText.length() && i < iMaxWord; i++) { char ch1 =
* sText.charAt(i); char ch2 =
* StringHelper.GetFirstLetter(String.valueOf(ch1)); if (ch1 == ch2) {
* continue; } sResult += ch2; } } else { String[] sepstr = { ";;", ";",
* ",", " " }; String[] sArray = sText.Split(sepstr,
* StringSplitOptions.RemoveEmptyEntries); if (sArray == null ||
* sArray.length <= 0) { return ""; }
*
* for (int i = 0; i < sArray.length && i<iMaxWord; i++) { line = sArray[i];
* Matcher RegIsPrep1 = RegIsPrep.matcher(line); if (RegIsPrep1.find()) {
* continue; } sResult += String.valueOf(StringHelper.GetFirstLetter(line));
* } } return sResult; }
*/
/**
 * Searches the text for any of the given strings and returns the smallest
 * or the largest position at which one of them occurs.
 *
 * <p>A hit at index 0 is now counted; the earlier {@code pos2 > 0} test
 * skipped it.
 *
 * <p>NOTE(review): {@code sHitStr} is only assigned locally, so the matched
 * string never reaches the caller; returning it requires a signature change.
 *
 * @param sText   text to search; must not be null
 * @param start   position at which each search begins
 * @param bMinPos {@code true} for the smallest position, {@code false} for
 *                the largest
 * @param sHitStr unused on return (see note)
 * @param sArray  candidate strings; must not be null
 * @return the selected position, or {@code -1} if no candidate occurs
 */
public static int IndexOfAny(String sText, int start, boolean bMinPos,
        String sHitStr, String[] sArray) {
    int best = -1;
    for (String candidate : sArray) {
        int pos = sText.indexOf(candidate, start);
        if (pos < 0) {
            continue;
        }
        boolean better = best == -1 || (bMinPos ? pos < best : pos > best);
        if (better) {
            sHitStr = candidate;
            best = pos;
        }
    }
    return best;
}
/**
 * Prefixes every line of the text with tab characters.
 *
 * <p>Lines are delimited by {@code '\n'} and keep their line breaks. Fixes
 * over the earlier version: lines are cut with an end index rather than a
 * length (the old form threw from the third line on), the line breaks are
 * no longer dropped, and a text starting with a line break is handled.
 *
 * @param sText    text to indent; must not be null
 * @param tabTimes number of tabs per line; values below 1 are treated as 1
 * @return the indented text
 */
public static String TabTextLine(String sText, int tabTimes) {
    StringBuilder tabs = new StringBuilder("\t");
    for (int i = 1; i < tabTimes; i++) {
        tabs.append('\t');
    }
    StringBuilder sb = new StringBuilder(sText.length() + 16);
    int lineStart = 0;
    int lineBreak = sText.indexOf('\n', lineStart);
    while (lineBreak >= 0) {
        // Copy the line including its '\n'.
        sb.append(tabs).append(sText, lineStart, lineBreak + 1);
        lineStart = lineBreak + 1;
        lineBreak = sText.indexOf('\n', lineStart);
    }
    if (lineStart < sText.length()) {
        sb.append(tabs).append(sText, lineStart, sText.length());
    }
    return sb.toString();
}
/**
 * Removes every span that starts with {@code sBeg} and ends with {@code sEnd}
 * (both markers included).
 *
 * <p>After each removal the search restarts at {@code start}. A begin marker with
 * no end marker after it is left in place and stops the processing.
 *
 * @param sText text to clean
 * @param start index from which begin markers are searched
 * @param sBeg  begin marker
 * @param sEnd  end marker
 * @return the text without the delimited spans
 */
public static String RemoveText(String sText, int start, String sBeg,
        String sEnd) {
    int begin = sText.indexOf(sBeg, start);
    while (begin >= 0) {
        // Look for the end marker only after the begin marker, so that the two cannot
        // overlap (e.g. identical begin and end markers).
        int end = sText.indexOf(sEnd, begin + sBeg.length());
        if (end < 0) {
            break;
        }
        end += sEnd.length();
        if (end == begin) {
            // Both markers are empty: nothing would be removed and the loop would
            // never terminate.
            break;
        }
        sText = sText.substring(0, begin) + sText.substring(end);
        begin = sText.indexOf(sBeg, start);
    }
    return sText;
}
/**
 * Strips everything enclosed in angle brackets from {@code sText}.
 *
 * <p>Each {@code '<'} is removed together with all characters up to and including the
 * next {@code '>'}. When a {@code '<'} has no closing {@code '>'}, the text is cut off
 * at that {@code '<'}.
 *
 * @param sText text that may contain markup
 * @return the text without the bracketed parts
 */
public static String RemoveHTML(String sText) {
    int open = sText.indexOf('<');
    if (open < 0) {
        return sText;
    }
    StringBuilder plain = new StringBuilder();
    int cursor = 0;
    do {
        // Keep the text in front of the tag.
        plain.append(sText, cursor, open);
        int close = sText.indexOf('>', open);
        if (close < 0) {
            // Unterminated tag: drop the rest of the text.
            return plain.toString();
        }
        cursor = close + 1;
        open = sText.indexOf('<', cursor);
    } while (open >= 0);
    plain.append(sText, cursor, sText.length());
    return plain.toString();
}
/**
 * Merges the entries of {@code sText} that differ from those in {@code sResult}.
 *
 * <p>Convenience overload: forwards to the four-argument {@code MergeContent} with a
 * {@code false} change flag.
 *
 * @param sText   text whose entries are merged in
 * @param sepstr  separator passed through to the four-argument overload
 * @param sResult text to merge into
 * @return whatever the four-argument overload returns
 */
public static String MergeContent(String sText, String sepstr,
        String sResult) {
    return MergeContent(sText, sepstr, sResult, Boolean.FALSE);
}
/**
 * Appends to {@code sResult} every entry of {@code sText} that {@code sResult} does not
 * contain yet.
 *
 * <p>Both texts are lists of entries separated by {@code sepstr}; appended entries are
 * always terminated with {@code ";;"}. Entries are trimmed before they are compared,
 * blank entries are skipped, and an entry that occurs several times in {@code sText}
 * is appended only once. A {@code ";;;"} sequence is read as "entry ending in
 * {@code ';'}, followed by {@code ";;"}"; the marker {@code "[xxx]"} is used internally
 * to protect that semicolon, so a literal {@code "[xxx]"} in the input comes back
 * as {@code ";"}.
 *
 * @param sText    entries to merge in; {@code null}, empty, {@code ";;"} or a text equal
 *                 to {@code sResult} leaves {@code sResult} unchanged
 * @param sepstr   separator between entries, taken literally (not as a regular expression)
 * @param sResult  entries collected so far; may be {@code null} or empty
 * @param bChanged ignored. Java passes the reference by value and {@code Boolean} is
 *                 immutable, so the method cannot report a change through it; it is
 *                 kept only so that existing call sites keep compiling
 * @return the merged text
 */
public static String MergeContent(String sText, String sepstr,
        String sResult, Boolean bChanged) {
    if (sText == null || sText.isEmpty() || sText.equals(sResult)
            || sText.equals(";;")) {
        return sResult;
    }
    if (sResult == null || sResult.isEmpty()) {
        return sText.endsWith(";;") ? sText : sText + ";;";
    }
    if (!sResult.endsWith(";;")) {
        sResult += ";;";
    }
    sResult = sResult.replace(";;;", "[xxx];;");

    List<String> existing = mergeContentItems(sResult, sepstr);
    if (existing.isEmpty()) {
        // sResult held separators only: the new text replaces it as is.
        return sText;
    }

    java.util.Set<String> known = new java.util.HashSet<String>(existing);
    StringBuilder merged = new StringBuilder(sResult);
    for (String item : mergeContentItems(sText.replace(";;;", "[xxx];;"), sepstr)) {
        // Set.add returns false for entries that are already present.
        if (known.add(item)) {
            merged.append(item).append(";;");
        }
    }
    return merged.toString().replace("[xxx]", ";");
}

/** Splits text on the literal separator and returns the trimmed, non-blank entries. */
private static List<String> mergeContentItems(String text, String sepstr) {
    List<String> items = new ArrayList<String>();
    for (String piece : text.split(Pattern.quote(sepstr))) {
        String item = piece.trim().replace("[xxx]", ";");
        if (!item.isEmpty()) {
            items.add(item);
        }
    }
    return items;
}
/**
 * Extracts the outer HTML of the first {@code sTagName} element whose inner text
 * matches {@code sInnerText}.
 *
 * <p>Elements are fetched one after another with {@code GetOuterHTML}, each call
 * continuing from the position returned by the previous one; the scan stops as soon as
 * that position is not positive. NOTE(review): {@code GetOuterHTML} is defined outside
 * this block - it presumably stores the element in the buffer and returns the position
 * behind it; confirm, including whether it resets the buffer on every call.
 *
 * @param sText      HTML text to scan
 * @param start      position at which the scan starts
 * @param iCompare   comparison mode handed to {@code MatchText}
 * @param sTagName   tag name of the elements to inspect
 * @param sInnerText text the element's inner text is compared with
 * @param sResult    receives (by append) the outer HTML of the matching element
 * @return the position reported by {@code GetOuterHTML} for the matching element,
 *         or -1 when no element matches
 */
public static int ExtractContentByInnerText(String sText, int start,
        int iCompare, String sTagName, String sInnerText,
        StringBuffer sResult) {
    StringBuffer outer = new StringBuffer();
    for (int next = StringHelper.GetOuterHTML(sText, start, sTagName, outer);
            next > 0;
            next = StringHelper.GetOuterHTML(sText, next, sTagName, outer)) {
        String inner = StringHelper.GetInnerText(outer.toString(), sTagName);
        if (!MatchText(inner, iCompare, sInnerText)) {
            continue;
        }
        sResult.append(outer);
        return next;
    }
    return -1;
}
/**
 * Locates an element by its inner text and extracts the text that follows it, up to
 * and including {@code sEndText}.
 *
 * <p>The element is located with {@code ExtractContentByInnerText}; the extracted span
 * starts at the position that call returns and ends right after the first occurrence
 * of {@code sEndText} found from there.
 *
 * @param sText      HTML text to scan
 * @param start      position at which the scan starts
 * @param iCompare   comparison mode used to match the inner text
 * @param sTagName   tag name of the element to locate
 * @param sInnerText text the element's inner text is compared with
 * @param sEndText   text that terminates the extracted span
 * @param sResult    receives (by append) the extracted span
 * @return the position right after {@code sEndText}, or -1 when the element or
 *         {@code sEndText} is not found
 */
public static int ExtractContentAfter(String sText, int start,
        int iCompare, String sTagName, String sInnerText, String sEndText,
        StringBuffer sResult) {
    // Only the returned position is needed; the matched element itself is discarded.
    StringBuffer matched = new StringBuffer();
    int from = ExtractContentByInnerText(sText, start, iCompare, sTagName,
            sInnerText, matched);
    if (from < 0) {
        return -1;
    }
    int end = sText.indexOf(sEndText, from);
    if (end < 0) {
        return -1;
    }
    end += sEndText.length();
    // Java ranges are (begin, end) indexes, not (begin, length).
    sResult.append(sText, from, end);
    return end;
}
/**
 * Extracts the outer HTML of the first {@code sTagName} element whose attribute
 * {@code sAttr} matches {@code sMatch}.
 *
 * <p>Elements are fetched one after another with {@code GetOuterHTML}, each call
 * continuing from the position returned by the previous one; the scan stops as soon as
 * that position is not positive. NOTE(review): {@code GetOuterHTML} and
 * {@code ParseAttribute} are defined outside this block - the same two buffers are
 * handed to them on every iteration, so confirm that they reset the buffers instead of
 * appending to them.
 *
 * @param sText    HTML text to scan
 * @param start    position at which the scan starts
 * @param iCompare comparison mode handed to {@code MatchText}
 * @param sTagName tag name of the elements to inspect
 * @param sAttr    name of the attribute to read
 * @param sMatch   text the attribute value is compared with
 * @param sResult  receives (by append) the outer HTML of the matching element
 * @return the position reported by {@code GetOuterHTML} for the matching element,
 *         or -1 when no element matches
 */
public static int ExtractContentByAttr(String sText, int start,
        int iCompare, String sTagName, String sAttr, String sMatch,
        StringBuffer sResult) {
    StringBuffer attrValue = new StringBuffer();
    StringBuffer outer = new StringBuffer();
    for (int next = StringHelper.GetOuterHTML(sText, start, sTagName, outer);
            next > 0;
            next = StringHelper.GetOuterHTML(sText, next, sTagName, outer)) {
        // The attribute must be present before its value is compared.
        if (StringHelper.ParseAttribute(outer.toString(), sAttr, attrValue)
                && MatchText(attrValue.toString(), iCompare, sMatch)) {
            sResult.append(outer);
            return next;
        }
    }
    return -1;
}
/**
 * Checks whether {@code sText} matches {@code sMatch} in the way selected by
 * {@code iCompare}.
 *
 * @param sText    text to test; must not be {@code null} for the contains modes
 * @param iCompare 0 = equal, positive = {@code sText} contains {@code sMatch},
 *                 negative = {@code sText} does not contain {@code sMatch}
 * @param sMatch   text to compare with; must not be {@code null} for the contains modes
 * @return {@code true} when the selected condition holds
 */
public static boolean MatchText(String sText, int iCompare, String sMatch) {
    if (iCompare == 0) {
        // Compare the content, not the references; two nulls count as equal.
        return sText == null ? sMatch == null : sText.equals(sMatch);
    }
    boolean contains = sText.indexOf(sMatch) >= 0;
    return iCompare > 0 ? contains : !contains;
}
}