HTMLPage类:
HTMLPage类的用途主要有几种:从页面中抓取图片是其中非常重要的一项功能,此外还可以提取超链接和表单。在HTMLPage类的内部类Parser中,大部分工作都是由handleSimpleTag(简单标签)和handleStartTag(起始标签)方法来完成的。
详细代码清单如下:
- package com.heaton.bot;
- import java.util.*;
- import com.heaton.bot.*;
- import java.net.*;
- import java.io.*;
- import javax.swing.text.*;
- import javax.swing.text.html.*;
- /**
- * The HTMLPage class is used to parse an HTML page and store
- * that page, in a parsed form, in memory.
- * The page content itself is obtained through the underlying HTTP object.
- */
- public class HTMLPage {
- /**
- * The images found on this page. Each element is the image URL as a
- * String, already resolved against the base URL by addImage.
- */
- protected Vector images = new Vector();
- /**
- * The links found on this page. Elements are the Link objects
- * created by the built-in parser callback.
- */
- protected Vector links = new Vector();
- /**
- * The forms found on this page. Elements are the HTMLForm objects
- * built by the built-in parser callback.
- */
- protected Vector forms = new Vector();
- /**
- * The underlying HTTP object for this page; used to send requests
- * and to read the response body.
- */
- protected HTTP http;
- /**
- * The base URL used to resolve relative URLs. Set to the requested
- * URL by open and replaced by the href of a BASE tag when the
- * built-in parser callback encounters one.
- */
- protected String base;
- /**
- * Create an HTMLPage that performs all of its requests through
- * the supplied HTTP object.
- *
- * @param http The HTTP object (or subclass) used to download pages.
- */
- public HTMLPage(HTTP http) {
- this.http = http;
- }
- /**
- * Request a page and parse the response. This is the entry point
- * into the HTMLPage class. When null is passed as the callback the
- * built-in callback is used, and the other methods of this class
- * may then be used to look at the images, links and forms.
- *
- * @param url The URL to read.
- * @param callback A callback class to handle the parse, or null
- * to use the built in one.
- * @exception java.io.IOException
- * @exception javax.swing.text.BadLocationException
- */
- public void open(String url,
- HTMLEditorKit.ParserCallback callback)
- throws IOException,BadLocationException {
- // Send the request first; base is only replaced once send has returned.
- http.send(url,null);
- this.base = url;
- this.processPage(callback);
- }
- /**
- * Internal function that parses the body currently held by the
- * HTTP object.
- *
- * @param callback The callback object to use, or null to use the
- * built-in Parser callback.
- * @exception java.io.IOException
- */
- protected void processPage(HTMLEditorKit.ParserCallback callback)
- throws IOException {
- // Wrap the response body in a reader for the parser.
- StringReader body = new StringReader(http.getBody());
- // Obtain a new parser.
- HTMLEditorKit.Parser parser = new HTMLParse().getParser();
- // A caller-supplied callback takes over completely; nothing
- // is collected by HTMLPage itself in that case.
- if ( callback!=null ) {
- parser.parse(body,callback,false);
- return;
- }
- // No callback supplied: fall back to the built-in Parser, which
- // fills the image, link and form lists.
- parser.parse(body,new HTMLPage.Parser(),true);
- }
- /**
- * Get the HTTP object that was handed to the constructor.
- *
- * @return The underlying HTTP object.
- */
- public HTTP getHTTP() {
- return this.http;
- }
- /**
- * Get the links collected from this page. The list is only
- * filled by the built-in callback, so null must have been passed
- * as the callback object to the open method. The internal list
- * itself is returned, not a copy.
- *
- * @return All links on this page.
- */
- public Vector getLinks() {
- return this.links;
- }
- /**
- * Get the images collected from this page. The list is only
- * filled by the built-in callback, so null must have been passed
- * as the callback object to the open method. The internal list
- * itself is returned, not a copy.
- *
- * @return A list of all of the images on this page.
- */
- public Vector getImages() {
- return this.images;
- }
- /**
- * Get the forms collected from this page. The list is only
- * filled by the built-in callback, so null must have been passed
- * as the callback object to the open method. The internal list
- * itself is returned, not a copy.
- *
- * @return A list of forms.
- */
- public Vector getForms() {
- return this.forms;
- }
- /**
- * Submit the specified form and parse the response with the
- * built-in callback. The request is sent to the form's action
- * with the form's string representation as its data, using the
- * content type application/x-www-form-urlencoded.
- *
- * @param form The form object to post.
- * @exception java.io.IOException
- */
- public void post(HTMLForm form)
- throws IOException {
- // Mark the request data as URL-encoded form data.
- http.getClientHeaders().set("Content-Type",
- "application/x-www-form-urlencoded");
- http.send(form.getAction(),form.toString());
- // Parse the response with the built-in callback.
- this.processPage(null);
- }
- /**
- * Get the URL of this page, as reported by the underlying
- * HTTP object.
- *
- * @return The URL that is represented by this page.
- */
- public String getURL() {
- return this.http.getURL();
- }
- /**
- * Called internally to record an image. The image URL is first
- * resolved against the base URL; it is then appended to the image
- * list unless an entry that matches it, ignoring case, is
- * already present.
- *
- * @param img The image to add.
- */
- protected void addImage(String img) {
- String resolved = URLUtility.resolveBase(base,img);
- // Look for an existing entry before appending.
- Enumeration known = images.elements();
- while ( known.hasMoreElements() ) {
- String existing = (String)known.nextElement();
- if ( existing.equalsIgnoreCase(resolved) )
- return;
- }
- images.addElement(resolved);
- }
- /**
- * A HTML parser callback used by this class to
- * detect links, images and forms.
- * Parser is an inner class that implements the built-in parser
- * callback, which keeps track of hyperlinks, images and forms.
- */
- protected class Parser
- extends HTMLEditorKit.ParserCallback {
- /**
- * The form currently being built; created when a FORM start tag
- * is seen.
- */
- protected HTMLForm tempForm;
- /**
- * The option list that OPTION elements are added to; a new list
- * is created for each SELECT that belongs to a form.
- */
- protected AttributeList tempOptions;
- /**
- * The option currently being built; its name is set from the
- * collected text when the OPTION end tag is seen.
- */
- protected Attribute tempElement = new Attribute();
- /**
- * Holds the prompt text (just before or after a control).
- * Text reported by the parser is appended here.
- */
- protected String tempPrompt = "";
- /**
- * Holds the link till the end link is found
- */
- protected Link tempLink;
- /**
- * Called to handle comments. Comments are ignored by this
- * callback.
- *
- * @param data The comment.
- * @param pos The position.
- */
- public void handleComment(char[] data,int pos) {
- // nothing to collect from comments
- }
- /**
- * Called to handle an ending tag. Completes the element that is
- * currently being built: an OPTION is named after the collected
- * text and added to the current option list, a FORM is added to
- * the page's form list, and an A receives the collected text as
- * its prompt.
- *
- * @param t The ending tag.
- * @param pos The position.
- */
- public void handleEndTag(HTML.Tag t,int pos)
- {
- if ( t==HTML.Tag.OPTION ) {
- if ( tempElement!=null ) {
- // tempOptions is still null when no SELECT inside a form has
- // been seen yet; such an option has no list to go to.
- if ( tempOptions!=null ) {
- tempElement.setName(tempPrompt);
- tempOptions.add(tempElement);
- }
- tempPrompt = "";
- }
- tempElement = null;
- } else if ( t==HTML.Tag.FORM ) {
- if ( tempForm!=null )
- forms.addElement(tempForm);
- // The form is complete. Clearing it keeps the next FORM start
- // tag from adding the same form a second time.
- tempForm = null;
- tempPrompt = "";
- } else if ( t==HTML.Tag.A ) {
- if ( tempLink!=null )
- tempLink.setPrompt(tempPrompt);
- // The link is complete. Clearing it keeps a later end tag of an
- // anchor without href from overwriting this link's prompt.
- tempLink = null;
- tempPrompt = "";
- }
- }
- /**
- * Called to handle an error. Errors reported by the parser are
- * ignored by this callback.
- *
- * @param errorMsg The error.
- * @param pos The position.
- */
- public void handleError(String errorMsg,int pos) {
- // not used
- }
- /**
- * Called to handle a simple tag. Simple tags are processed
- * exactly like starting tags.
- *
- * @param t The simple tag.
- * @param a The attribute list.
- * @param pos The position.
- */
- public void handleSimpleTag(HTML.Tag t,
- MutableAttributeSet a,int pos) {
- this.handleStartTag(t,a,pos);
- }
- /**
- * Called to handle a starting tag (and, through handleSimpleTag,
- * a simple tag). Depending on the tag this records a link, starts
- * or extends the form being built, replaces the base URL, or
- * records an image.
- *
- * @param t The starting tag.
- * @param a The attribute list.
- * @param pos The position.
- */
- public void handleStartTag(HTML.Tag t,
- MutableAttributeSet a,int pos)
- {
- // is it some sort of a link
- String href = (String)a.getAttribute(HTML.Attribute.HREF);
- if ( (href!=null) && (t!=HTML.Tag.BASE) ) {
- // Any tag other than BASE that carries an href is recorded as a
- // link; it stays in tempLink so the end tag can set its prompt.
- String alt = (String)a.getAttribute(HTML.Attribute.ALT);
- Link link = new Link(
- alt,
- URLUtility.resolveBase(base,href),
- null);
- links.addElement(tempLink=link);
- } else if ( t==HTML.Tag.OPTION ) {
- // The option's name is filled in from its text at the end tag.
- tempElement = new Attribute();
- tempElement.setName("");
- tempElement.setValue((String)a.getAttribute(HTML.Attribute.VALUE));
- } else if ( t==HTML.Tag.SELECT ) {
- if ( tempForm==null )
- return;
- tempOptions = new AttributeList();
- tempForm.addInput(
- (String)a.getAttribute(HTML.Attribute.NAME),
- null,
- "select",
- tempPrompt,
- tempOptions);
- tempPrompt = "";
- } else if ( t==HTML.Tag.TEXTAREA ) {
- if ( tempForm==null )
- return;
- tempForm.addInput(
- (String)a.getAttribute(HTML.Attribute.NAME),
- null,
- "textarea",
- tempPrompt,
- null);
- tempPrompt = "";
- }
- else if ( t==HTML.Tag.FORM ) {
- // A form that is still open when the next one starts is added
- // to the list before it is replaced.
- if ( tempForm!=null )
- forms.addElement(tempForm);
- String action =
- (String)a.getAttribute(HTML.Attribute.ACTION);
- if ( action!=null ) {
- try {
- // Make the action absolute relative to the page URL.
- URL aurl = new URL(new URL(http.getURL()),action);
- action = aurl.toString();
- } catch ( MalformedURLException e ) {
- // An action that cannot be resolved is treated as absent.
- action = null;
- }
- }
- tempForm = new HTMLForm(
- (String)a.getAttribute(HTML.Attribute.METHOD),
- action );
- tempPrompt = "";
- } else if ( t==HTML.Tag.INPUT ) {
- if ( tempForm==null )
- return;
- // Use the control's own type attribute. An INPUT without a type
- // attribute is a text field, so "text" is used in that case.
- String type = (String)a.getAttribute(HTML.Attribute.TYPE);
- if ( type==null )
- type = "text";
- // Only these input types are recorded in the form.
- if ( type.equalsIgnoreCase("text") ||
- type.equalsIgnoreCase("edit") ||
- type.equalsIgnoreCase("password") ||
- type.equalsIgnoreCase("select") ||
- type.equalsIgnoreCase("hidden") ) {
- tempForm.addInput(
- (String)a.getAttribute(HTML.Attribute.NAME),
- (String)a.getAttribute(HTML.Attribute.VALUE),
- type,
- tempPrompt,
- null);
- // Point tempOptions at a fresh list that is not attached to
- // any control.
- tempOptions = new AttributeList();
- // NOTE(review): unlike SELECT and TEXTAREA, tempPrompt is not
- // cleared here - confirm whether that is intended.
- }
- } else if ( t==HTML.Tag.BASE ) {
- // Reached only when the BASE tag has no href (href is null here),
- // so the assignment below cannot fire.
- // NOTE(review): a BASE tag that does carry an href matches neither
- // the link branch above nor this one, so base is never replaced -
- // confirm the intended behaviour.
- href = (String)a.getAttribute(HTML.Attribute.HREF);
- if ( href!=null )
- base = href;
- } else if ( t==HTML.Tag.IMG ) {
- String src = (String)a.getAttribute(HTML.Attribute.SRC);
- if ( src!=null )
- addImage(src);
- }
- }
- /**
- * Called to handle text. The text, followed by a single space,
- * is appended to the prompt collected so far.
- *
- * @param data The text.
- * @param pos The position.
- */
- public void handleText(char[] data,int pos) {
- String text = new String(data);
- tempPrompt = tempPrompt + text + " ";
- }
- }
- }