此类由KXML进化而来,用于手机解析HTML,XML,TXT,XHTML,WML等文档,支持CDATA,支持Text Extractor
如果在平时的开发用得上它,请保留作者和出处,谢谢!
package Core;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Hashtable;
/**
 * Relaxed pull parser for HTML, XML, TXT, XHTML and WML documents, derived from kXML.
 * Supports CDATA sections and whitespace-collapsing text extraction
 * ({@link #getTextExtractor()}).
 *
 * <p>Typical use: call one of the {@code setInput} methods, then call {@link #next()}
 * repeatedly until it returns {@link #END_DOCUMENT}, inspecting {@link #getTagName()},
 * {@link #getAttributeValue(String, String)} and {@link #getText()} after each event.
 *
 * <p>Problems found in the document are reported through a single hook that prints the
 * description to standard output; parsing then continues on a best-effort basis.
 *
 * @author fonter
 * http://fonter.iteye.com
 */
public class HtmlInputStreamReader {

    // ------------------------------------------------------------------ constants

    private static final String UNEXPECTED_EOF = "Unexpected EOF";

    public static final String NO_NAMESPACE = "";

    public static final int START_DOCUMENT = 0;
    public static final int END_DOCUMENT = 1;
    public static final int START_TAG = 2;
    public static final int END_TAG = 3;
    public static final int TEXT = 4;
    public static final int CDSECT = 5;
    public static final int ENTITY_REF = 6;
    public static final int IGNORABLE_WHITESPACE = 7;
    public static final int PROCESSING_INSTRUCTION = 8;
    public static final int COMMENT = 9;
    public static final int DOCDECL = 10;
    /** Internal marker returned by {@code peekType()} for {@code <?} and {@code <!} constructs. */
    public static final int LEGACY = 999;
    /** Internal marker for an XML declaration; never returned by {@link #next()}. */
    public static final int XML_DECL = 998;

    /** Characters that {@link #isWhiteSpace(char)} treats as whitespace. */
    private static final char[] WHITESPACE = {' ', '\n', '\r', '\t', '\f', '\u200B'};

    // ------------------------------------------------------------------ state

    private Reader reader;
    private boolean unresolved;
    private boolean processNsp = true;
    private boolean token;
    private boolean wasCR;
    private String encoding;
    private char[] srcBuf;
    private Hashtable entityMap;
    private boolean relaxed = true;
    private boolean degenerated;
    /** Four slots per attribute: namespace, prefix, name, value. */
    private String[] attributes = new String[16];
    private int type;

    /** Two slots per declaration: prefix, namespace URI. */
    private String[] nspStack = new String[8];
    private int[] nspCounts = new int[4];
    private String version;
    private Boolean standalone;

    private char[] txtBuf = new char[128];
    private int txtPos;
    /** Pending message that the next call to {@link #nextImpl()} reports as a COMMENT event. */
    private String error;

    private int srcLength;
    private int srcPos;
    private int srcCount;

    /** Number of open elements that still have to be closed implicitly. */
    private int stackMismatch = 0;
    private String namespace;
    private String prefix;
    private String name;
    /** Four slots per open element: namespace, prefix, local name, raw name. */
    private String[] elementStack = new String[16];

    private int line;
    private int column;
    private int[] peek = new int[2];
    private int peekCount;
    private boolean isWhitespace;
    private int attributeCount;
    private int depth;

    // ------------------------------------------------------------------ construction / input

    /**
     * Creates a parser. The source buffer is 8192 chars when at least 1 MiB of memory is
     * reported free, otherwise 128 chars.
     *
     * @throws IOException never thrown by this implementation; kept in the signature for
     *         existing callers
     */
    public HtmlInputStreamReader() throws IOException {
        srcBuf = new char[Runtime.getRuntime().freeMemory() >= 1048576 ? 8192 : 128];
    }

    /**
     * Sets the character source and resets the parser state.
     *
     * @param reader the source to parse; when {@code null} only the document-level state is
     *        reset and a later {@link #next()} fails with an {@link IOException}
     * @throws IOException declared for callers; not thrown by this implementation
     */
    public void setInput(Reader reader) throws IOException {
        this.reader = reader;
        line = 1;
        column = 0;
        type = START_DOCUMENT;
        name = null;
        namespace = null;
        degenerated = false;
        attributeCount = -1;
        encoding = null;
        version = null;
        standalone = null;
        srcLength = 0;
        // Clear leftovers from a previous document so a reused parser starts clean.
        error = null;
        stackMismatch = 0;
        wasCR = false;
        unresolved = false;
        if (reader == null) {
            return;
        }
        srcPos = 0;
        srcCount = 0;
        peekCount = 0;
        depth = 0;
        entityMap = new Hashtable();
        entityMap.put("amp", "&");
        entityMap.put("apos", "'");
        entityMap.put("gt", ">");
        entityMap.put("lt", "<");
        entityMap.put("quot", "\"");
        entityMap.put("copy", "\251");
        entityMap.put("reg", "\256");
        entityMap.put("yen", "\245");
    }

    /**
     * Sets a byte source. When {@code _enc} is {@code null} the first four bytes are inspected
     * (byte-order marks, {@code <?xml} declaration and a few document prefixes) to pick a
     * charset; if nothing matches, the platform default charset is used.
     *
     * @param is the stream to parse, must not be {@code null}
     * @param _enc charset name, or {@code null} to auto-detect
     * @throws IllegalArgumentException if {@code is} is {@code null}
     * @throws IOException if the stream cannot be read or the charset is not supported
     */
    public void setInput(InputStream is, String _enc) throws IOException {
        srcPos = 0;
        srcCount = 0;
        String enc = _enc;
        if (is == null) {
            throw new IllegalArgumentException();
        }
        try {
            if (enc == null) {
                // Read up to four bytes; they stay in srcBuf as already-consumed characters.
                int chk = 0;
                while (srcCount < 4) {
                    int b = is.read();
                    if (b == -1) {
                        break;
                    }
                    chk = (chk << 8) | b;
                    srcBuf[srcCount++] = (char) b;
                }
                if (srcCount == 4) {
                    enc = detectEncoding(is, chk);
                }
            }
            // setInput(Reader) clears srcCount, so preserve the number of pre-read chars.
            int preRead = srcCount;
            if (enc == null) {
                setInput(new InputStreamReader(is));
            } else {
                setInput(new InputStreamReader(is, enc));
            }
            // Report the encoding actually in use, including an auto-detected one.
            encoding = enc;
            srcCount = preRead;
        } catch (Exception e) {
            throw new IOException("Invalid stream or encoding: " + e);
        }
    }

    /**
     * Maps the first four bytes of a stream to a charset name and rewrites {@code srcBuf} /
     * {@code srcCount} so they hold exactly the characters those bytes represent.
     *
     * @param is stream positioned after the four bytes; read further only for {@code <?xm}
     * @param chk the four bytes packed big-endian into an int
     * @return the detected charset name, or {@code null} when nothing matched
     */
    private String detectEncoding(InputStream is, int chk) throws IOException {
        switch (chk) {
            case 0x0000FEFF: // UTF-32BE byte-order mark
                srcCount = 0;
                return "UTF-32BE";
            case 0xFFFE0000: // UTF-32LE byte-order mark
                srcCount = 0;
                return "UTF-32LE";
            case 0x0000003C: // '<' in UTF-32BE
                srcBuf[0] = '<';
                srcCount = 1;
                return "UTF-32BE";
            case 0x3C000000: // '<' in UTF-32LE
                srcBuf[0] = '<';
                srcCount = 1;
                return "UTF-32LE";
            case 0x003C003F: // "<?" in UTF-16BE
                srcBuf[0] = '<';
                srcBuf[1] = '?';
                srcCount = 2;
                return "UTF-16BE";
            case 0x3C003F00: // "<?" in UTF-16LE
                srcBuf[0] = '<';
                srcBuf[1] = '?';
                srcCount = 2;
                return "UTF-16LE";
            case 0x3C68746D: // "<htm": four single-byte chars, all of which must be kept
                return "gb2312";
            case 0x0D0A3C3F: // CR LF "<?": the leading line break is dropped
                srcBuf[0] = '<';
                srcBuf[1] = '?';
                srcCount = 2;
                return "UTF-8";
            case 0x0A0A3C21: // LF LF "<!": the leading line breaks are dropped
                srcBuf[0] = '<';
                srcBuf[1] = '!';
                srcCount = 2;
                return "UTF-8";
            case 0x3C3F786D: // "<?xm": read the declaration and look for encoding="..."
                return sniffXmlDeclaration(is);
            default:
                if ((chk & 0xFFFF0000) == 0xFEFF0000) { // UTF-16BE BOM + one char
                    srcBuf[0] = (char) ((srcBuf[2] << 8) | srcBuf[3]);
                    srcCount = 1;
                    return "UTF-16BE";
                }
                if ((chk & 0xFFFF0000) == 0xFFFE0000) { // UTF-16LE BOM + one char
                    srcBuf[0] = (char) ((srcBuf[3] << 8) | srcBuf[2]);
                    srcCount = 1;
                    return "UTF-16LE";
                }
                if ((chk & 0xFFFFFF00) == 0xEFBBBF00) { // UTF-8 BOM + one byte
                    srcBuf[0] = srcBuf[3];
                    srcCount = 1;
                    return "UTF-8";
                }
                return null;
        }
    }

    /**
     * Reads bytes into {@code srcBuf} up to the first {@code '>'} and extracts the value of
     * the {@code encoding} pseudo-attribute.
     *
     * @return the declared encoding, {@code "UTF-8"} when the declaration names none, or
     *         {@code null} when no {@code '>'} was found before end of stream or before the
     *         buffer filled up
     */
    private String sniffXmlDeclaration(InputStream is) throws IOException {
        while (srcCount < srcBuf.length) {
            int b = is.read();
            if (b == -1) {
                break;
            }
            srcBuf[srcCount++] = (char) b;
            if (b != '>') {
                continue;
            }
            String decl = new String(srcBuf, 0, srcCount);
            String found = null;
            int from = decl.indexOf("encoding");
            if (from != -1) {
                while (from < decl.length()
                        && decl.charAt(from) != '"'
                        && decl.charAt(from) != '\'') {
                    from++;
                }
                if (from < decl.length()) {
                    char quote = decl.charAt(from++);
                    int to = decl.indexOf(quote, from);
                    if (to != -1) {
                        found = decl.substring(from, to);
                    }
                }
            }
            return found == null ? "UTF-8" : found;
        }
        return null;
    }

    // ------------------------------------------------------------------ low-level reading

    /**
     * Returns the character {@code pos} positions ahead without consuming it, refilling the
     * source buffer as needed. CR and CR LF are delivered as a single {@code '\n'};
     * end of input is delivered as -1.
     */
    private final int peek(int pos) throws IOException {
        while (pos >= peekCount) {
            int nw;
            if (srcBuf.length <= 1) {
                nw = reader.read();
            } else if (srcPos < srcCount) {
                nw = srcBuf[srcPos++];
            } else {
                srcCount = reader.read(srcBuf, 0, srcBuf.length);
                nw = srcCount <= 0 ? -1 : srcBuf[0];
                srcPos = 1;
            }
            if (nw == '\r') {
                wasCR = true;
                peek[peekCount++] = '\n';
            } else {
                if (nw == '\n') {
                    // The LF of a CR LF pair was already delivered for the CR.
                    if (!wasCR) {
                        peek[peekCount++] = '\n';
                    }
                } else {
                    peek[peekCount++] = nw;
                }
                wasCR = false;
            }
        }
        return peek[pos];
    }

    /** Classifies the upcoming construct from the next one or two characters. */
    private final int peekType() throws IOException {
        switch (peek(0)) {
            case -1:
                return END_DOCUMENT;
            case '&':
                return ENTITY_REF;
            case '<':
                switch (peek(1)) {
                    case '/':
                        return END_TAG;
                    case '?':
                    case '!':
                        return LEGACY;
                    default:
                        return START_TAG;
                }
            default:
                return TEXT;
        }
    }

    /** Consumes one character and reports a problem when it is not {@code c}. */
    private final void read(char c) throws IOException {
        int a = read();
        if (a != c) {
            error("expected: '" + c + "' actual: '" + ((char) a) + "'");
        }
    }

    /** Consumes and returns the next character (-1 at end of input), tracking line/column. */
    private final int read() throws IOException {
        int result;
        if (peekCount == 0) {
            result = peek(0);
        } else {
            result = peek[0];
            peek[0] = peek[1];
        }
        peekCount--;
        column++;
        srcLength++;
        if (result == '\n') {
            line++;
            column = 1;
        }
        return result;
    }

    /** Consumes characters up to the next one greater than a space, or end of input. */
    private final void skip() throws IOException {
        while (true) {
            int c = peek(0);
            if (c > ' ' || c == -1) {
                break;
            }
            read();
        }
    }

    /** Appends one char to the text buffer, growing it when full. */
    private final void push(int c) {
        isWhitespace &= c <= ' ';
        if (txtPos == txtBuf.length) {
            char[] bigger = new char[txtPos * 4 / 3 + 4];
            System.arraycopy(txtBuf, 0, bigger, 0, txtPos);
            txtBuf = bigger;
        }
        txtBuf[txtPos++] = (char) c;
    }

    /** Returns the text buffer content from {@code pos} to the current write position. */
    private final String get(int pos) {
        return new String(txtBuf, pos, txtPos - pos);
    }

    /** Returns {@code arr}, or a larger copy when it has fewer than {@code required} slots. */
    private final String[] ensureCapacity(String[] arr, int required) {
        if (arr.length >= required) {
            return arr;
        }
        String[] bigger = new String[required + 16];
        System.arraycopy(arr, 0, bigger, 0, arr.length);
        return bigger;
    }

    // ------------------------------------------------------------------ error reporting

    /** Reports a recoverable problem in the document. */
    private final void error(String desc) {
        exception(desc);
    }

    /** Single reporting hook: prints the description to standard output. */
    private final void exception(String desc) {
        System.out.println(desc);
    }

    // ------------------------------------------------------------------ event loop

    /**
     * Advances to the next low-level event and stores its kind in the event type
     * (see {@link #getEventType()}). XML declarations are consumed silently.
     *
     * @throws IOException if no input was set or the underlying reader fails
     */
    public final void nextImpl() throws IOException {
        if (reader == null) {
            // Fail with a meaningful, already-declared exception instead of a later NPE.
            throw new IOException("No Input specified");
        }
        if (type == END_TAG) {
            depth--;
        }
        while (true) {
            attributeCount = -1;
            // An empty-element tag is followed by a synthetic END_TAG; this must be
            // handled before pending errors.
            if (degenerated) {
                degenerated = false;
                type = END_TAG;
                return;
            }
            if (error != null) {
                for (int i = 0; i < error.length(); i++) {
                    push(error.charAt(i));
                }
                error = null;
                type = COMMENT;
                return;
            }
            // Close elements implicitly after a mismatched end tag or at end of input.
            if (relaxed && (stackMismatch > 0 || (peek(0) == -1 && depth > 0))) {
                int sp = (depth - 1) << 2;
                type = END_TAG;
                namespace = elementStack[sp];
                prefix = elementStack[sp + 1];
                name = elementStack[sp + 2];
                if (stackMismatch != 1) {
                    error = "missing end tag /" + name + " inserted";
                }
                if (stackMismatch > 0) {
                    stackMismatch--;
                }
                return;
            }
            prefix = null;
            name = null;
            namespace = null;
            type = peekType();
            switch (type) {
                case ENTITY_REF:
                    pushEntity();
                    return;
                case START_TAG:
                    parseStartTag(false);
                    return;
                case END_TAG:
                    parseEndTag();
                    return;
                case END_DOCUMENT:
                    return;
                case TEXT:
                    pushText('<', !token);
                    if (depth == 0 && isWhitespace) {
                        type = IGNORABLE_WHITESPACE;
                    }
                    return;
                default:
                    type = parseLegacy(token);
                    if (type != XML_DECL) {
                        return;
                    }
            }
        }
    }

    /**
     * Advances to the next high-level event. Adjacent text, entity references and CDATA are
     * merged into one {@link #TEXT} event; comments, processing instructions and DOCTYPE
     * declarations are not reported as events of their own.
     *
     * @return {@link #START_TAG}, {@link #END_TAG}, {@link #TEXT} or {@link #END_DOCUMENT}
     * @throws IOException if no input was set or the underlying reader fails
     */
    public int next() throws IOException {
        txtPos = 0;
        isWhitespace = true;
        int minType = 9999;
        token = false;
        do {
            nextImpl();
            if (type < minType) {
                minType = type;
            }
        } while (minType > ENTITY_REF // nothing reportable seen yet
                || (minType >= TEXT && peekType() >= TEXT)); // more text follows
        type = minType;
        if (type > TEXT) {
            type = TEXT;
        }
        return type;
    }

    // ------------------------------------------------------------------ accessors

    /**
     * Returns the encoding passed to or detected by {@code setInput(InputStream, String)},
     * or the one named in the XML declaration once it has been parsed; {@code null} if unknown.
     */
    public String getInputEncoding() {
        return encoding;
    }

    /** Returns the text of the current event, or {@code null} for tag and document events. */
    public String getText() {
        if (type < TEXT || (type == ENTITY_REF && unresolved)) {
            return null;
        }
        return get(0);
    }

    /**
     * Text extractor: like {@link #getText()}, but with leading and trailing whitespace
     * removed and every inner whitespace run collapsed to a single space.
     */
    public String getTextExtractor() {
        if (type < TEXT || (type == ENTITY_REF && unresolved)) {
            return null;
        }
        return appendCollapseWhiteSpace(new StringBuffer(), get(0)).toString();
    }

    /** Returns whether {@code ch} is space, LF, CR, tab, form feed or zero-width space. */
    public static final boolean isWhiteSpace(final char ch) {
        for (int i = 0; i < WHITESPACE.length; i++) {
            if (ch == WHITESPACE[i]) {
                return true;
            }
        }
        return false;
    }

    /**
     * Appends {@code text} to {@code sb} with leading and trailing whitespace dropped and
     * inner whitespace runs replaced by one space.
     *
     * @return {@code sb}
     */
    static final StringBuffer appendCollapseWhiteSpace(StringBuffer sb, String text) {
        final int textLength = text.length();
        int i = 0;
        // Skip leading whitespace; nothing to append if the text is all whitespace.
        while (true) {
            if (i >= textLength) {
                return sb;
            }
            if (!isWhiteSpace(text.charAt(i))) {
                break;
            }
            i++;
        }
        boolean pendingSpace = false;
        do {
            final char ch = text.charAt(i++);
            if (isWhiteSpace(ch)) {
                pendingSpace = true;
            } else {
                // A space is emitted only when more text follows, so trailing runs vanish.
                if (pendingSpace) {
                    sb.append(' ');
                    pendingSpace = false;
                }
                sb.append(ch);
            }
        } while (i < textLength);
        return sb;
    }

    /** Returns the type of the current event. */
    public int getEventType() {
        return type;
    }

    /** Returns the number of characters consumed so far (end-of-input reads included). */
    public int getLength() {
        return srcLength;
    }

    /** Returns the local name of the current tag. */
    public String getTagName() {
        return name;
    }

    /**
     * Returns the value of an attribute of the current start tag.
     *
     * @param namespace namespace to match, or {@code null} to match any namespace
     * @param name local attribute name
     * @return the attribute value, or {@code null} when there is no such attribute
     */
    public String getAttributeValue(String namespace, String name) {
        for (int i = (attributeCount << 2) - 4; i >= 0; i -= 4) {
            // The stored namespace is null for undefined prefixes, so compare from the
            // non-null side.
            if (attributes[i + 2].equals(name)
                    && (namespace == null || namespace.equals(attributes[i]))) {
                return attributes[i + 3];
            }
        }
        return null;
    }

    /**
     * Returns the namespace URI bound to {@code prefix} at the current depth.
     *
     * @param prefix the prefix to look up, or {@code null} for the default namespace
     * @return the URI, or {@code null} when the prefix is not bound
     */
    public String getNamespace(String prefix) {
        if ("xml".equals(prefix)) {
            return "http://www.w3.org/XML/1998/namespace";
        }
        if ("xmlns".equals(prefix)) {
            return "http://www.w3.org/2000/xmlns/";
        }
        for (int i = (getNamespaceCount(depth) << 1) - 2; i >= 0; i -= 2) {
            if (prefix == null) {
                if (nspStack[i] == null) {
                    return nspStack[i + 1];
                }
            } else if (prefix.equals(nspStack[i])) {
                return nspStack[i + 1];
            }
        }
        return null;
    }

    /**
     * Returns the number of namespace declarations visible at {@code depth}.
     *
     * @throws IndexOutOfBoundsException if {@code depth} exceeds the current depth
     */
    public int getNamespaceCount(int depth) {
        if (depth > this.depth) {
            throw new IndexOutOfBoundsException();
        }
        return nspCounts[depth];
    }

    // ------------------------------------------------------------------ markup parsing

    /**
     * Parses {@code </name>}. A name that does not match the innermost open element is
     * searched for case-insensitively further down the stack; when found, the elements
     * above it are scheduled for implicit closing, otherwise the tag is ignored.
     */
    private final void parseEndTag() throws IOException {
        read(); // '<'
        read(); // '/'
        name = readName();
        skip();
        read('>');
        if (depth == 0) {
            error("element stack empty");
            type = COMMENT;
            return;
        }
        int sp = (depth - 1) << 2;
        if (!name.equals(elementStack[sp + 3])) {
            error("expected: /" + elementStack[sp + 3] + " read: " + name);
            String wanted = name.toLowerCase();
            int probe = sp;
            while (probe >= 0 && !wanted.equals(elementStack[probe + 3].toLowerCase())) {
                stackMismatch++;
                probe -= 4;
            }
            if (probe < 0) {
                // No open element has this name: ignore the unexpected end tag.
                stackMismatch = 0;
                type = COMMENT;
                return;
            }
        }
        namespace = elementStack[sp];
        prefix = elementStack[sp + 1];
        name = elementStack[sp + 2];
    }

    /**
     * Parses a construct starting with {@code <?} or {@code <!}: XML declaration,
     * processing instruction, comment, CDATA section or DOCTYPE.
     *
     * @param push whether the content is copied to the text buffer (always for CDATA)
     * @return the event type of the construct
     */
    private final int parseLegacy(boolean push) throws IOException {
        String req = "";
        int term;
        int result;
        int prev = 0;
        read(); // '<'
        int c = read();
        if (c == '?') {
            if ((peek(0) == 'x' || peek(0) == 'X')
                    && (peek(1) == 'm' || peek(1) == 'M')) {
                if (push) {
                    push(peek(0));
                    push(peek(1));
                }
                read();
                read();
                if ((peek(0) == 'l' || peek(0) == 'L') && peek(1) <= ' ') {
                    if (line != 1 || column > 4) {
                        error("PI must not start with xml");
                    }
                    parseStartTag(true);
                    if (attributeCount < 1 || !"version".equals(attributes[2])) {
                        error("version expected");
                    }
                    version = attributes[3];
                    int pos = 1;
                    if (pos < attributeCount && "encoding".equals(attributes[2 + 4])) {
                        encoding = attributes[3 + 4];
                        pos++;
                    }
                    if (pos < attributeCount
                            && "standalone".equals(attributes[4 * pos + 2])) {
                        String st = attributes[3 + 4 * pos];
                        if ("yes".equals(st)) {
                            standalone = Boolean.TRUE;
                        } else if ("no".equals(st)) {
                            standalone = Boolean.FALSE;
                        } else {
                            error("illegal standalone value: " + st);
                        }
                        pos++;
                    }
                    if (pos != attributeCount) {
                        error("illegal xmldecl");
                    }
                    isWhitespace = true;
                    txtPos = 0;
                    return XML_DECL;
                }
            }
            term = '?';
            result = PROCESSING_INSTRUCTION;
        } else if (c == '!') {
            if (peek(0) == '-') {
                result = COMMENT;
                req = "--";
                term = '-';
            } else if (peek(0) == '[') {
                result = CDSECT;
                req = "[CDATA[";
                term = ']';
                push = true;
            } else {
                result = DOCDECL;
                req = "DOCTYPE";
                term = -1;
            }
        } else {
            error("illegal: <" + c);
            return COMMENT;
        }
        for (int i = 0; i < req.length(); i++) {
            char expected = req.charAt(i);
            if (result == DOCDECL) {
                // HTML writes the keyword in any case ("<!doctype html>").
                int actual = read();
                if (actual != expected && actual != Character.toLowerCase(expected)) {
                    error("expected: '" + expected + "' actual: '" + ((char) actual) + "'");
                }
            } else {
                read(expected);
            }
        }
        if (result == DOCDECL) {
            parseDoctype(push);
        } else {
            while (true) {
                c = read();
                if (c == -1) {
                    error(UNEXPECTED_EOF);
                    return COMMENT;
                }
                if (push) {
                    push(c);
                }
                if ((term == '?' || c == term) && peek(0) == term && peek(1) == '>') {
                    break;
                }
                prev = c;
            }
            if (term == '-' && prev == '-') {
                error("illegal comment delimiter: --->");
            }
            read();
            read();
            // Drop the first terminator char that was pushed before the end was recognised.
            if (push && term != '?') {
                txtPos--;
            }
        }
        return result;
    }

    /**
     * Reads a tag or attribute name. At least one character is always consumed; the text
     * buffer position is restored afterwards.
     */
    private final String readName() throws IOException {
        int pos = txtPos;
        int c = peek(0);
        if ((c < 'a' || c > 'z')
                && (c < 'A' || c > 'Z')
                && c != '_'
                && c != ':'
                && c < 0x0c0
                && !relaxed) {
            error("name expected");
        }
        do {
            push(read());
            c = peek(0);
        } while ((c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9')
                || c == '_'
                || c == '-'
                || c == ':'
                || c == '.'
                || c >= 0x0b7);
        String result = get(pos);
        txtPos = pos;
        return result;
    }

    /**
     * Consumes the remainder of a DOCTYPE declaration up to its closing {@code '>'},
     * honouring nested {@code <...>} and single-quoted sections.
     */
    private final void parseDoctype(boolean push) throws IOException {
        int nesting = 1;
        boolean quoted = false;
        while (true) {
            int i = read();
            switch (i) {
                case -1:
                    error(UNEXPECTED_EOF);
                    return;
                case '\'':
                    quoted = !quoted;
                    break;
                case '<':
                    if (!quoted) {
                        nesting++;
                    }
                    break;
                case '>':
                    if (!quoted && (--nesting) == 0) {
                        return;
                    }
                    break;
            }
            if (push) {
                push(i);
            }
        }
    }

    /**
     * Copies text to the text buffer until {@code delimiter} or end of input.
     *
     * @param delimiter terminating char; a space means "unquoted attribute value", which
     *        also ends at any char up to space or at {@code '>'}
     * @param resolveEntities whether {@code &...;} is expanded; if not, text stops at
     *        {@code '&'}
     */
    private final void pushText(int delimiter, boolean resolveEntities) throws IOException {
        int next = peek(0);
        int cbrCount = 0;
        while (next != -1 && next != delimiter) {
            if (delimiter == ' ' && (next <= ' ' || next == '>')) {
                break;
            }
            if (next == '&') {
                if (!resolveEntities) {
                    break;
                }
                pushEntity();
            } else if (next == '\n' && type == START_TAG) {
                // Line breaks inside attribute values become spaces.
                read();
                push(' ');
            } else {
                push(read());
            }
            if (next == '>' && cbrCount >= 2 && delimiter != ']') {
                error("Illegal: ]]>");
            }
            if (next == ']') {
                cbrCount++;
            } else {
                cbrCount = 0;
            }
            next = peek(0);
        }
    }

    /**
     * Parses an entity or character reference and appends its replacement to the text
     * buffer. An {@code '&'} that is not followed by a terminated reference is kept as
     * literal text. References that cannot be resolved (unknown name, empty or malformed
     * numeric reference) are reported and contribute no text.
     */
    private final void pushEntity() throws IOException {
        push(read()); // '&'
        int pos = txtPos;
        while (true) {
            int c = read();
            if (c == ';') {
                break;
            }
            if (c < 128
                    && (c < '0' || c > '9')
                    && (c < 'a' || c > 'z')
                    && (c < 'A' || c > 'Z')
                    && c != '_'
                    && c != '-'
                    && c != '#') {
                if (!relaxed) {
                    error("unterminated entity ref");
                }
                if (c != -1) {
                    push(c);
                }
                return;
            }
            push(c);
        }
        String code = get(pos);
        txtPos = pos - 1; // drop '&' and the reference body from the buffer
        if (token && type == ENTITY_REF) {
            name = code;
        }
        if (code.length() > 0 && code.charAt(0) == '#') {
            int codePoint = parseCharRef(code);
            if (codePoint >= 0) {
                unresolved = false;
                if (codePoint > 0xFFFF) {
                    // Outside the BMP: emit a UTF-16 surrogate pair.
                    int offset = codePoint - 0x10000;
                    push(0xD800 | (offset >> 10));
                    push(0xDC00 | (offset & 0x3FF));
                } else {
                    push(codePoint);
                }
                return;
            }
        }
        String result = null;
        if (code.length() > 0 && code.charAt(0) != '#') {
            result = (String) entityMap.get(code);
        }
        unresolved = result == null;
        if (unresolved) {
            if (!token) {
                error("unresolved: &" + code + ";");
            }
        } else {
            for (int i = 0; i < result.length(); i++) {
                push(result.charAt(i));
            }
        }
    }

    /**
     * Decodes the body of a numeric character reference ({@code #65}, {@code #x41}).
     *
     * @param code reference body starting with {@code '#'}
     * @return the code point, or -1 when the digits are missing, malformed or out of range
     */
    private static int parseCharRef(String code) {
        try {
            int value;
            if (code.length() > 1 && (code.charAt(1) == 'x' || code.charAt(1) == 'X')) {
                value = Integer.parseInt(code.substring(2), 16);
            } else {
                value = Integer.parseInt(code.substring(1));
            }
            return value >= 0 && value <= 0x10FFFF ? value : -1;
        } catch (NumberFormatException e) {
            return -1;
        }
    }

    /**
     * Parses a start tag with its attributes and pushes the element on the stack.
     *
     * @param xmldecl {@code true} when parsing the pseudo-attributes of an XML declaration;
     *        parsing then ends at {@code ?>} and no element is pushed
     */
    private final void parseStartTag(boolean xmldecl) throws IOException {
        if (!xmldecl) {
            read(); // '<'
        }
        name = readName();
        attributeCount = 0;
        while (true) {
            skip();
            int c = peek(0);
            if (xmldecl) {
                if (c == '?') {
                    read();
                    read('>');
                    return;
                }
            } else {
                if (c == '/') {
                    // Empty-element tag: nextImpl() reports the END_TAG on the next call.
                    degenerated = true;
                    read();
                    skip();
                    read('>');
                    break;
                }
                if (c == '>') {
                    read();
                    break;
                }
            }
            if (c == -1) {
                error(UNEXPECTED_EOF);
                return;
            }
            String attrName = readName();
            if (attrName.length() == 0) {
                error("attr name expected");
                break;
            }
            int i = (attributeCount++) << 2;
            attributes = ensureCapacity(attributes, i + 4);
            attributes[i++] = "";
            attributes[i++] = null;
            attributes[i++] = attrName;
            skip();
            if (peek(0) != '=') {
                // Attribute without a value gets the value "1".
                error("Attr.value missing f. " + attrName);
                attributes[i] = "1";
            } else {
                read('=');
                skip();
                int delimiter = peek(0);
                if (delimiter != '\'' && delimiter != '"') {
                    error("attr value delimiter missing!");
                    delimiter = ' ';
                } else {
                    read();
                }
                int p = txtPos;
                pushText(delimiter, true);
                attributes[i] = get(p);
                txtPos = p;
                if (delimiter != ' ') {
                    read(); // closing quote
                }
            }
        }
        int sp = depth++ << 2;
        elementStack = ensureCapacity(elementStack, sp + 4);
        elementStack[sp + 3] = name;
        if (depth >= nspCounts.length) {
            int[] bigger = new int[depth + 4];
            System.arraycopy(nspCounts, 0, bigger, 0, nspCounts.length);
            nspCounts = bigger;
        }
        nspCounts[depth] = nspCounts[depth - 1];
        if (processNsp) {
            adjustNsp();
        } else {
            namespace = "";
        }
        elementStack[sp] = namespace;
        elementStack[sp + 1] = prefix;
        elementStack[sp + 2] = name;
    }

    /**
     * Applies namespace processing to the current start tag: moves {@code xmlns}
     * declarations from the attribute list to the namespace stack, then splits prefixed
     * attribute names and the tag name and resolves their prefixes.
     *
     * @return whether any attribute carried a prefix other than {@code xmlns}
     */
    private final boolean adjustNsp() {
        boolean any = false;
        for (int i = 0; i < attributeCount << 2; i += 4) {
            String attrName = attributes[i + 2];
            int cut = attrName.indexOf(':');
            String attrPrefix;
            if (cut != -1) {
                attrPrefix = attrName.substring(0, cut);
                attrName = attrName.substring(cut + 1);
            } else if (attrName.equals("xmlns")) {
                attrPrefix = attrName;
                attrName = null; // default namespace declaration
            } else {
                continue;
            }
            if (!attrPrefix.equals("xmlns")) {
                any = true;
            } else {
                int j = (nspCounts[depth]++) << 1;
                nspStack = ensureCapacity(nspStack, j + 2);
                nspStack[j] = attrName;
                nspStack[j + 1] = attributes[i + 3];
                if (attrName != null && attributes[i + 3].equals("")) {
                    error("illegal empty namespace");
                }
                // Remove the declaration from the attribute list and revisit this slot.
                System.arraycopy(
                        attributes, i + 4, attributes, i, ((--attributeCount) << 2) - i);
                i -= 4;
            }
        }
        if (any) {
            for (int i = (attributeCount << 2) - 4; i >= 0; i -= 4) {
                String attrName = attributes[i + 2];
                int cut = attrName.indexOf(':');
                if (cut == 0 && !relaxed) {
                    throw new RuntimeException(
                            "illegal attribute name: " + attrName + " at " + this);
                } else if (cut != -1) {
                    String attrPrefix = attrName.substring(0, cut);
                    attrName = attrName.substring(cut + 1);
                    String attrNs = getNamespace(attrPrefix);
                    if (attrNs == null && !relaxed) {
                        throw new RuntimeException(
                                "Undefined Prefix: " + attrPrefix + " in " + this);
                    }
                    attributes[i] = attrNs; // stays null for an undefined prefix
                    attributes[i + 1] = attrPrefix;
                    attributes[i + 2] = attrName;
                }
            }
        }
        int cut = name.indexOf(':');
        if (cut == 0) {
            error("illegal tag name: " + name);
        }
        if (cut != -1) {
            prefix = name.substring(0, cut);
            name = name.substring(cut + 1);
        }
        this.namespace = getNamespace(prefix);
        if (this.namespace == null) {
            if (prefix != null) {
                error("undefined prefix: " + prefix);
            }
            this.namespace = NO_NAMESPACE;
        }
        return any;
    }
}
导读
J2ME Tabbed Menu http://fonter.iteye.com/blog/409982
手机中的重定向问题及处理 http://fonter.iteye.com/blog/400836
J2ME如何通过cmwap直接访问互连网 http://fonter.iteye.com/blog/400868
安装JAD时提示JAD无效原因之一 http://fonter.iteye.com/blog/400888
J2ME网络交互之优化 http://fonter.iteye.com/blog/405137
解决Eclipse无法调试J2ME程序的配置方法 http://fonter.iteye.com/blog/405697
J2ME模拟器加载RMS时突然失效的原因 http://fonter.iteye.com/blog/407576
J2ME飞信协议分析(初稿) http://fonter.iteye.com/blog/408385