对html文件进行扫描,将html元素抽象出来形成树。
- package Source;
- import java.io.*;
- public class HTML2Tree
- {
- //构造方法
- public HTML2Tree()
- {
- tree = new HTree();
- myStack = new Stack();
- }
- //主要处理方法,将一个html文件转换成一个树
- public void main(String filename)
- {
- //html文件
- File f = new File(filename);
- String s = "";
- //标志类型
- String tag = "";
- //读取的当前位置
- int pos = 0;
- //开始位置
- int start = 0;
- int num = 0;
- int status = 0;
- boolean getNewLine = true;
- //html为根节点
- insert("html");
- try
- {
- BufferedReader fis = new BufferedReader(new FileReader(f));
- do
- {
- //获取新的一行并初始化要处理的字符串
- if(getNewLine)
- {
- s = fis.readLine();
- if(s == null) break;
- num++;
- if(s.equalsIgnoreCase("")) continue;
- s = s.toLowerCase();
- pos = 0;
- }
- //状态0,还在html层,并一直读至body块为止
- if(status == 0)
- {
- pos = s.indexOf(");
- if(pos < 0) getNewLine = true;
- else
- {
- //body入栈
- insert("body");
- status = 1;
- pos++;
- getNewLine = false;
- }
- continue;
- }
- //状态1,已进入body部分
- if(status == 1)
- {
- //一直读到有<号位置
- pos = s.indexOf("<", pos);
- if(pos < 0)
- {
- getNewLine = true;
- pos = 0;
- continue;
- }
- //获取标志类型
- tag = getTag(s, pos + 1);
- if(tag != null)
- {
- //是否标志结束
- if(isEndTag(tag))
- {
- if(mayIgnor(tag))
- {
- //状态2,在标签内
- status = 2;
- getNewLine = false;
- continue;
- }
- //根栈顶元素匹配
- if(match(tag))
- {
- myStack.pop();
- //处理完成
- if(myStack.empty()) break;
- pos++;
- getNewLine = false;
- //状态4,读取标签间内容
- status = 4;
- //继续读下一个标志
- start = s.indexOf(">", pos) + 1;
- pos = start;
- }
- else
- {
- //状态2,在标签内
- status = 2;
- getNewLine = false;
- }
- }
- else if(mayIgnor(tag))
- {
- //状态2,在标签内
- status = 2;
- getNewLine = false;
- }
- else
- {
- getNewLine = false;
- if(!isJump(tag))
- {
- //状态2,在标签内
- status = 2;
- //标记入栈
- insert(tag);
- }
- //状态3,结束标签
- else status = 3;
- }
- }
- else getNewLine = true;
- }
- //状态2,在标签内
- else if(status == 2)
- {
- //一直读至结束
- start = s.indexOf(">", pos);
- //判断标志是否跨行
- if(start < 0)
- {
- getNewLine = true;
- pos = 0;
- }
- else
- {
- start++;
- status = 4;
- getNewLine = false;
- }
- }
- //状态3,结束标签
- else if(status == 3)
- {
- pos = s.indexOf((new StringBuilder("/")).append(tag).toString(), pos);
- if(pos < 0)
- {
- getNewLine = true;
- pos = 0;
- }
- else
- {
- pos = s.indexOf(">", pos);
- start = ++pos;
- status = 4;
- getNewLine = false;
- }
- }
- //状态4,获取标签之间的内容
- else if(status == 4)
- {
- int end = s.indexOf("<", start);
- if(end < 0)
- {
- String content = s.substring(start);
- if(!content.trim().equalsIgnoreCase(""))
- ((Node)myStack.getTop()).addContent(content);
- getNewLine = true;
- start = 0;
- }
- else
- {
- String content = s.substring(start, end);
- content = remove(content);
- if(!content.trim().equalsIgnoreCase(""))
- ((Node)myStack.getTop()).addContent(content);
- status = 1;
- pos = end;
- getNewLine = false;
- }
- }
- } while(true);
- }
- catch(IOException e)
- {
- System.out.println(e);
- }
- }
- //获取标签类型
- private String getTag(String line, int pos)
- {
- int end1 = line.indexOf(">", pos);
- int end2 = line.indexOf(" ", pos);
- if(end1 < 0 && end2 >= 0)
- return line.substring(pos, end2);
- if(end1 > 0 && end2 >= 0 && end2 < end1)
- return line.substring(pos, end2);
- if(end2 < 0 && end1 >= 0)
- return line.substring(pos, end1);
- if(end1 < 0 && end2 < 0)
- return null;
- if(end1 >= 0 && end2 >= 0 && end1 < end2)
- return line.substring(pos, end1);
- else
- return line.substring(pos);
- }
- //是否为可跳过字符
- private boolean isJump(String tag)
- {
- int size = Symbol.jump.length;
- for(int i = 0; i < size; i++)
- if(tag.equalsIgnoreCase(Symbol.jump[i]))
- return true;
- return false;
- }
- //是否结束标签
- private boolean isEndTag(String tag)
- {
- int pos = tag.indexOf("/");
- return pos >= 0;
- }
- //是否跟栈顶标签匹配
- private boolean match(String tag)
- {
- Node node = (Node)myStack.getTop();
- String str = node.tag;
- int pos = tag.indexOf(str);
- return pos >= 0;
- }
- //是否为可忽略字符
- private boolean mayIgnor(String tag)
- {
- int i = 0;
- for(int size = Symbol.ignore.length; i < size; i++)
- {
- boolean res = tag.equalsIgnoreCase(Symbol.ignore[i]);
- if(res) return true;
- int r = tag.indexOf("!");
- if(r >= 0) return true;
- }
- return false;
- }
- //删除特殊字符
- public String remove(String str)
- {
- int len = Symbol.remove.length;
- for(int i = 0; i < len; i++)
- {
- String s = Symbol.remove[i];
- str = str.replaceAll(s, "");
- }
- return str;
- }
- //将树上节点插入栈中
- private void insert(String tag)
- {
- Node node;
- if(myStack.empty()) node = new Node("", tag, null);
- else node = new Node("", tag, (Node)myStack.getTop());
- myStack.push(node);
- tree.insert(node);
- }
- //返回整棵树
- public HTree getTree()
- {
- return tree;
- }
- private Stack myStack;
- private HTree tree;
- }