import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.beanutils.BeanUtils;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that collects the text of {@code <td>} elements, grouped per
 * {@code <tr>} element, and prints one line per collected row when the
 * closing {@code </html>} tag is reached.
 *
 * <p>Element names are matched case-insensitively. Every element other than
 * {@code tr}, {@code td} and the closing {@code html} is ignored.
 */
public class ParserTest extends DefaultHandler {
    /** Input file parsed by {@link #main(String[])} when no path is given. */
    private static final String DEFAULT_INPUT = "D:/1.xml";
    /** Element whose end tag triggers printing of the collected rows. */
    private static final String TAG_HTML = "html";

    /** Accumulates the text of the {@code <td>} currently being read. */
    StringBuffer tdBuffer = new StringBuffer();
    /** True between a {@code <tr>} start tag and the next {@code </tr>} that is processed. */
    boolean tr_in_stack = false;
    static String tag_tr = "tr";
    static String tag_td = "td";
    /** Row currently being filled; null until the first {@code <tr>} is seen. */
    TrData trData;
    /** Rows in the order their {@code </tr>} was processed. */
    Stack<TrData> trDatas = new Stack<TrData>();
    /** Not used by the code in this class. */
    Stack<Object> objects = new Stack<Object>();
    /** Names of the currently open tr/td elements, innermost on top. */
    Stack<String> tagNames = new Stack<String>();

    /** Prints a banner marking the start of the document. */
    @Override
    public void startDocument() throws SAXException {
        System.out.println("----- document begin -----");
    }

    /** Prints a banner marking the end of the document. */
    @Override
    public void endDocument() throws SAXException {
        System.out.println("----- document end -----");
    }

    /**
     * Appends character data to the current cell buffer when the innermost
     * tracked element is a {@code <td>}; all other text is ignored.
     *
     * @param ch     buffer holding the characters
     * @param start  index of the first character to read
     * @param length number of characters to read
     */
    @Override
    public void characters(char ch[], int start, int length)
            throws SAXException {
        if (this.tagNames.isEmpty() || !this.tagNames.peek().equals(tag_td)) {
            return;
        }
        // Each chunk is trimmed on its own, so whitespace at a chunk boundary
        // is dropped (a parser may deliver one text node in several chunks).
        String value = new String(ch, start, length).trim();
        if (value.length() > 0) {
            tdBuffer.append(value);
        }
    }

    /**
     * Tracks the opening of {@code <tr>} and {@code <td>} elements.
     *
     * <p>A {@code <tr>} opened while no row is open starts a fresh
     * {@link TrData}. A {@code <tr>} opened while another row is still open
     * only resets the tag stack and keeps filling the current row.
     */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes atts) throws SAXException {
        qName = qName.toLowerCase();
        boolean isRow = qName.equals(tag_tr);
        if (!isRow && !qName.equals(tag_td)) {
            return;
        }
        if (isRow) {
            if (tr_in_stack) {
                this.tagNames.clear();
            } else {
                this.trData = new TrData();
            }
            tr_in_stack = true;
        }
        this.tagNames.add(qName);
    }

    /**
     * Handles the closing of {@code <td>}, {@code <tr>} and {@code <html>}.
     *
     * <ul>
     * <li>{@code </td>}: stores the buffered text (possibly empty) as a cell
     * of the current row and resets the buffer.</li>
     * <li>{@code </tr>}: records the current row and marks the row closed.</li>
     * <li>{@code </html>}: prints the collected rows.</li>
     * </ul>
     */
    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        // Normalise case first so that "</HTML>" is recognised by the guard below.
        qName = qName.toLowerCase();
        if (tagNames.isEmpty() && !qName.equals(TAG_HTML)) {
            return;
        }
        if (qName.equals(tag_td)) {
            this.tagNames.pop();
            // trData is null when a <td> appears before any <tr>; such a cell
            // has no row to belong to and is skipped instead of crashing.
            if (this.trData != null) {
                // Empty cells are stored too: printRows() skips rows whose
                // first cell is empty, so the cell positions must be kept.
                this.trData.addTd(this.tdBuffer.toString());
            }
            this.tdBuffer = new StringBuffer();
        }
        if (qName.equals(tag_tr)) {
            this.tagNames.clear();
            tr_in_stack = false;
            if (this.trData != null) {
                this.trDatas.add(this.trData);
            }
        }
        if (qName.equals(TAG_HTML)) {
            printRows();
        }
    }

    /**
     * Prints every collected row whose first cell is non-empty as
     * {@code "<index> <cells>"} followed by a blank line. The cells are
     * joined with commas and all space characters are removed. The index
     * counts printed rows starting at 0.
     */
    private void printRows() {
        List<String> lines = new ArrayList<String>();
        for (TrData row : trDatas) {
            List<String> cells = row.getTds();
            // Rows without any <td> have no first cell to inspect; skip them.
            if (cells.isEmpty() || cells.get(0).isEmpty()) {
                continue;
            }
            String joined = cells.toString();
            // Strip the surrounding "[" and "]" of List.toString().
            joined = joined.substring(1, joined.length() - 1);
            lines.add(joined.replace(" ", ""));
        }
        for (int i = 0; i < lines.size(); i++) {
            System.out.println(i + " " + lines.get(i));
            System.out.println();
        }
    }

    /**
     * Sets one property on {@code target} per attribute, using the
     * attribute's qualified name as the property name and
     * {@code BeanUtils.setProperty} to assign the value.
     *
     * @param target object whose properties are set
     * @param atts   attributes to copy
     * @throws IllegalAccessException    propagated from {@code BeanUtils.setProperty}
     * @throws InvocationTargetException propagated from {@code BeanUtils.setProperty}
     */
    protected void setAttributes(Object target, Attributes atts)
            throws IllegalAccessException, InvocationTargetException {
        for (int i = 0; i < atts.getLength(); i++) {
            BeanUtils.setProperty(target, atts.getQName(i), atts.getValue(i));
        }
    }

    /**
     * Returns false when {@code value} starts with {@code "<img"} or
     * {@code "<table"}, true otherwise. Not called by the code in this class.
     */
    private boolean isValid(String value) {
        if (value.startsWith("<img") || value.startsWith("<table"))
            return false;
        return true;
    }

    /** The cell texts of one table row, in document order. */
    public static class TrData {
        private List<String> tds = new ArrayList<String>();

        /** Appends one cell text to this row. */
        public void addTd(String tdMsg) {
            tds.add(tdMsg);
        }

        /** Returns the live list of cell texts of this row. */
        public List<String> getTds() {
            return tds;
        }
    }

    /**
     * Parses an XML file as UTF-8 with a {@link ParserTest} handler.
     *
     * @param args optional; {@code args[0]} is the path of the file to parse.
     *             When absent, {@code D:/1.xml} is used.
     */
    public static void main(String args[]) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        SAXParserFactory saxFactory = SAXParserFactory.newInstance();
        InputStream inputStream = null;
        Reader reader = null;
        try {
            inputStream = new FileInputStream(path);
            /* specify the charset explicitly when reading the stream */
            reader = new InputStreamReader(inputStream, "UTF-8");
            /* the reader-backed InputSource is what the SAX parser consumes */
            InputSource source = new InputSource(reader);
            SAXParser parser = saxFactory.newSAXParser();
            ParserTest handler = new ParserTest();
            parser.parse(source, handler);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            /* release the resources; close each one independently so a
             * failure closing the reader cannot leave the stream open */
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                }
            }
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                }
            }
        }
    }
}