XULRunner with Java: JavaXPCOM Tutorial 4

最新推荐文章于 2024-09-20 15:44:47 发布

ljl_xyf

最新推荐文章于 2024-09-20 15:44:47 发布

阅读量103

点赞数

分类专栏： java抓取文章标签： java

本文链接：https://blog.csdn.net/ljl_xyf/article/details/84185811

版权

java抓取专栏收录该内容

11 篇文章 0 订阅

订阅专栏

7. 用JavaXPCOM来自动浏览网页
   在这一节里我们将解决自动浏览网页中的一些问题，尝试把一些通用的任务抽象成人类可读的方法，因此你可以轻易地阅读

代码并知道它的功能。我们将构建一个Web Browser来加载网页，点击按钮或者超链接，使用XPath来抽取一些有用的信息。在每一个

小节里面，我们都将在我们的浏览器里增加新的功能，因此在最后，我们将有一个Web Browser，它能够实现网页的自动
浏览。

7.1 使用我们的浏览器来加载网页
    在例子 SimpleBrowser里，我们使用方法 public boolean setUrl(String url) 来让浏览器加载一个url。这个方法的问题是它

可以让浏览器开始加载一个页面，但是不会等待浏览器加载页面完成。我们写了一个方法叫做go来实现这个功能，因此后面我们将使

用这种方法来安全的加载一个页面，阻塞住执行流程直到页面加载成功或者超时。
    注：实现的方法是在setUrl的时候定义一个CountDownLatch，然后监听browser.addProgressListener，等页面完成后把这个

CountDownLatch countDown一下。调用setUrl后使用CountDownLatch的await方法等待加载完成或者超时。注意调用setUrl时要启动

一个SWT线程
               display.syncExec(new Runnable() {
            public void run() {
                browser.setUrl(url);
            }
        });
    另外，我们可能还想获得加载后的页面内容，所以译者增加了一个变量content。增加的代码如下：
                browser.addProgressListener(new ProgressListener() {
                    public void changed(ProgressEvent event) {
                    }

                    public void completed(ProgressEvent event) {
                        // When a page is loaded, decrement the latch,
                        // which count will be 0 after this call.
                        latch.countDown();
                        content=browser.getText();// added by LiLi
                    }
                });

    package es.ladyr.javaxpcom.browser;  
      
    import java.util.concurrent.CountDownLatch;  
    import java.util.concurrent.TimeUnit;  
    import org.eclipse.swt.SWT;  
    import org.eclipse.swt.browser.ProgressEvent;  
    import org.eclipse.swt.browser.ProgressListener;  
    import org.eclipse.swt.widgets.Display;  
    import org.eclipse.swt.widgets.Shell;  
    import org.eclipse.swt.browser.Browser;  
    import org.eclipse.swt.SWTError;  
      
    /**
     * A small SWT-hosted Mozilla browser that can load a page and block the
     * calling thread until the page has finished loading or a timeout expires.
     *
     * <p>The SWT display, shell and browser widget are created and driven by a
     * dedicated thread named "SWT-Event-Thread"; every call that touches the
     * browser widget is dispatched to that thread with
     * {@code Display.syncExec}.
     */
    public class SimpleBrowserWithGo {

        // System property consulted by SWT to locate a specific XULRunner
        // runtime; it has to be set before the first Browser is created.
        private static final String XULRUNNER_PATH_PROPERTY =
                "org.eclipse.swt.browser.XULRunnerPath";

        Browser browser;

        // We need the SWT display to execute methods
        // in the SWT event thread.
        private Display display;

        // Latch used to manage page loading. 'go' installs a fresh latch with
        // count 1 before each load and the progress listener releases it when
        // the page is loaded. It is written by the caller's thread and read by
        // the SWT thread, hence volatile.
        private volatile CountDownLatch latch;

        // Default timeout: 60 seconds
        private long defaultTimeout = 60000;

        // Set by the SWT thread when the browser widget could not be created.
        // It is written before initLatch.countDown() and read after
        // initLatch.await(), which makes the write visible to the constructor.
        private SWTError initError;

        /**
         * Creates a web browser which is able to load pages waiting until
         * the page is completely loaded.
         *
         * @param xulrunnerPath directory of an uncompressed XULRunner package,
         *        or {@code null} to let SWT pick an installed XULRunner
         * @throws IllegalStateException if the browser widget cannot be created
         *         or the calling thread is interrupted while waiting for it
         */
        public SimpleBrowserWithGo(final String xulrunnerPath) {

            if (xulrunnerPath != null) {
                System.setProperty(XULRUNNER_PATH_PROPERTY, xulrunnerPath);
            }

            // Use a latch to wait for the browser initialization.
            final CountDownLatch initLatch = new CountDownLatch(1);

            // The browser widget needs a window manager to work. We are using
            // SWT for the graphical interface, so browser methods have to be
            // executed in the SWT event thread.
            new Thread("SWT-Event-Thread") {
                @Override
                public void run() {
                    display = new Display();
                    Shell shell = new Shell(display);
                    shell.setSize(800, 600);
                    shell.open();

                    try {
                        browser = new Browser(shell, SWT.MOZILLA);
                    } catch (SWTError e) {
                        System.out.println("Could not instantiate Browser: " + e.getMessage());
                        initError = e;
                        display.dispose();
                        // Release the constructor, otherwise it would wait forever.
                        initLatch.countDown();
                        return;
                    }

                    // Adapt browser size to shell size
                    browser.setBounds(shell.getClientArea());

                    // Listens for page loading status.
                    browser.addProgressListener(new ProgressListener() {
                        public void changed(ProgressEvent event) {
                        }

                        public void completed(ProgressEvent event) {
                            // A load may complete before 'go' was ever called,
                            // in which case there is no latch to release yet.
                            CountDownLatch current = latch;
                            if (current != null) {
                                current.countDown();
                            }
                        }
                    });

                    // Release the initialization latch, which has value 1,
                    // so after this call its value will be 0.
                    initLatch.countDown();

                    while (!shell.isDisposed()) {
                        if (!display.readAndDispatch()) {
                            display.sleep();
                        }
                    }
                    System.exit(0);
                }
            }.start();

            try {
                // Waits until the initialization latch is released.
                initLatch.await();
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller instead of clearing it.
                Thread.currentThread().interrupt();
                throw new IllegalStateException(
                        "Interrupted while waiting for the browser to initialize", e);
            }

            if (initError != null) {
                throw new IllegalStateException("Could not instantiate Browser", initError);
            }
        }

        /**
         * Loads an URL into the browser and waits until the page is
         * totally loaded.
         *
         * @param url address of the page to load
         * @throws SimpleBrowserException if the page is not loaded within the
         *         default timeout
         */
        public void go(final String url) throws SimpleBrowserException {

            // Creates a latch with count 1
            latch = new CountDownLatch(1);

            // Uses the SWT event thread to execute the method to
            // load an URL in the browser.
            display.syncExec(new Runnable() {
                public void run() {
                    browser.setUrl(url);
                }
            });

            // Waits for the page loading to finish, or for the timeout
            // in case the loading doesn't finish in a reasonable time.
            boolean timeout = waitLoad(defaultTimeout);
            if (timeout) {
                throw new SimpleBrowserException("Timeout waiting page loading.");
            }
        }

        /**
         * Waits on the latch installed by {@code go} until the progress
         * listener reports completion. When the wait times out the load is
         * stopped and the latch is awaited once more so the stop can settle.
         *
         * @param millis maximum time to wait, in milliseconds
         * @return {@code true} if the first wait timed out
         */
        private boolean waitLoad(long millis) {
            try {
                boolean timeout = !latch.await(millis, TimeUnit.MILLISECONDS);
                if (timeout) {
                    // The timeout expired, so stop the page loading.
                    display.syncExec(new Runnable() {
                        public void run() {
                            browser.stop();
                        }
                    });
                    // Waits until the loading is stopped
                    latch.await(millis, TimeUnit.MILLISECONDS);
                }
                return timeout;
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new IllegalStateException("Interrupted while waiting for page loading", e);
            }
        }

        /**
         * Demo entry point: loads three pages one after another.
         *
         * @param args optional; {@code args[0]} is the XULRunner directory
         */
        public static void main(String[] args) {
            String xulrunnerPath = null;
            if (args.length > 0) {
                xulrunnerPath = args[0];
            }

            // Instantiate our simple web browser
            SimpleBrowserWithGo simpleBrowser = new SimpleBrowserWithGo(xulrunnerPath);

            try {
                // Use the new functionality to load some URLs
                // with our browser.
                simpleBrowser.go("http://www.google.com");
                Thread.sleep(3000);
                simpleBrowser.go("http://www.urjc.es");
                Thread.sleep(3000);
                simpleBrowser.go("http://www.mozilla.org");
                Thread.sleep(3000);

            } catch (SimpleBrowserException e) {
                System.err.println("Problems calling go method.");
                e.printStackTrace();
            } catch (InterruptedException e) {
                System.err.println("Problems calling sleep.");
                e.printStackTrace();
                Thread.currentThread().interrupt();
            }

            // The SWT event thread is still running its loop, so the JVM is
            // terminated explicitly.
            Runtime.getRuntime().halt(0);
        }

    }

7.2 解析XPath来获得W3C Node
一旦我们能够在浏览器里加载一个HTML页面，我们可能想访问DOM节点来抽取信息。前面我们花了一节来把Mozilla Node转换成

W3C node。现在我们使用那个方法来用标准的方法操作W3C Node。我们实现了一些方法来创建XPath Evaluator和XPath resolver来

抽取节点。当XPath evaluator返回一个结果时，我们把每个返回的DOM node转换成相应的W3C DOM element，使用的方法是 static

Node getNodeInstance( nsIDOMNode node ) 。因此，使用我们的browser可以直接调用下面的方法：

    ...  
    import org.w3c.dom.Node;  
    ...  
    /** 
     * 
     * @param xpath 
     * @return      a list with the nodes corresponding to a given xpath. 
     * @throws SimpleBrowserException 
     */  
    public List<Node> xpathNodes(String xpath) {  
    ...  
    /** 
     * 
     * @param <T> 
     * @param xpath 
     * @param nodeClass 
     * @return      a list of <code>nodeClass</code> nodes corresponding 
     *      to a given xpath. 
     * @throws SimpleBrowserException 
     */  
    public <T extends Node> List<T> xpathNodes(String xpath, Class<T> nodeClass) {  
    ...

下面是完整的例子：

译注：核心代码就是下面这个方法。它有两个参数，xpath和nsIDOMNode,返回的是满足XPath的Node的list。
Node是W3C Node 。其实就是调用xpcom的接口来做xpath解析，然后把nsIDOMNode的List转成W3C Node的list。
不过我感觉其实没有必要，nsIDOMNode的属性更多，而且W3C Node没办法转回去成为nsIDOMNode。在使用XPath的时候，我们可能先

用绝对XPath找到某个table，然后根据相对XPath找tr，td等。第二次调用xPathNodes时的参数 nsIDOMNode context就是第一次返回

的结果里的nsIDOMNode。

private List<Node> xPathNodes(String xpath, nsIDOMNode context) {
        // The namespace resolver is built from the Mozilla DOM document that
        // backs the W3C document currently loaded in the browser.
        nsIDOMHTMLDocument mozDocument =
                ((HTMLDocumentImpl) getW3CDocument()).getInstance();
        nsIDOMXPathNSResolver nsResolver = xpathEval.createNSResolver(mozDocument);

        // Evaluate the expression relative to 'context', asking for an
        // ordered node iterator as the result type.
        nsISupports rawResult = xpathEval.evaluate(xpath, context, nsResolver,
                nsIDOMXPathResult.ORDERED_NODE_ITERATOR_TYPE, null);

        // Switch to the XPath result interface of the returned XPCOM object.
        nsIDOMXPathResult xpathResult = (nsIDOMXPathResult) rawResult
                .queryInterface(nsIDOMXPathResult.NS_IDOMXPATHRESULT_IID);

        // Collect the matched nodes into a standard List; any XPCOMException
        // raised while iterating propagates to the caller unchanged.
        return getNodes(xpathResult);
    }
  
package es.ladyr.javaxpcom.browser;  
import java.util.ArrayList;  
import java.util.List;  
import java.util.concurrent.CountDownLatch;  
import java.util.concurrent.TimeUnit;  
import org.eclipse.swt.SWT;  
import org.eclipse.swt.SWTError;  
import org.eclipse.swt.browser.Browser;  
import org.eclipse.swt.browser.ProgressEvent;  
import org.eclipse.swt.browser.ProgressListener;  
import org.eclipse.swt.widgets.Display;  
import org.eclipse.swt.widgets.Shell;  
import org.mozilla.dom.NodeFactory;  
import org.mozilla.dom.html.HTMLDocumentImpl;  
import org.mozilla.interfaces.nsIComponentManager;  
import org.mozilla.interfaces.nsIDOMDocument;  
import org.mozilla.interfaces.nsIDOMHTMLDocument;  
import org.mozilla.interfaces.nsIDOMNode;  
import org.mozilla.interfaces.nsIDOMWindow;  
import org.mozilla.interfaces.nsIDOMXPathEvaluator;  
import org.mozilla.interfaces.nsIDOMXPathNSResolver;  
import org.mozilla.interfaces.nsIDOMXPathResult;  
import org.mozilla.interfaces.nsISupports;  
import org.mozilla.interfaces.nsIWebBrowser;  
import org.mozilla.xpcom.Mozilla;  
import org.w3c.dom.Node;  
import org.w3c.dom.html.HTMLAnchorElement;  
import org.w3c.dom.html.HTMLDocument;  
  
/**
 * A small SWT-hosted Mozilla browser that can load a page, block until the
 * page has finished loading (or a timeout expires) and evaluate XPath
 * expressions against the loaded document, returning W3C DOM nodes.
 *
 * <p>The SWT display, shell and browser widget are created and driven by a
 * dedicated thread named "SWT-Event-Thread"; calls that touch the browser
 * widget are dispatched to that thread with {@code Display.syncExec}.
 */
public class SimpleBrowserWithXPath {

    private final static String NS_IDOMXPATHEVALUATOR_CONTRACTID = "@mozilla.org/dom/xpath-evaluator;1";

    // System property consulted by SWT to locate a specific XULRunner
    // runtime; it has to be set before the first Browser is created.
    private static final String XULRUNNER_PATH_PROPERTY =
            "org.eclipse.swt.browser.XULRunnerPath";

    private Browser browser;

    // We need the SWT display to execute methods
    // in the SWT event thread.
    private Display display;

    // Latch used to manage page loading. 'go' installs a fresh latch with
    // count 1 before each load and the progress listener releases it when
    // the page is loaded. It is written by the caller's thread and read by
    // the SWT thread, hence volatile.
    private volatile CountDownLatch latch;

    // Default timeout: 60 seconds
    private long defaultTimeout = 60000;

    // XPath evaluator
    private nsIDOMXPathEvaluator xpathEval;

    // Set by the SWT thread when the browser widget could not be created.
    // It is written before initLatch.countDown() and read after
    // initLatch.await(), which makes the write visible to the constructor.
    private SWTError initError;

    /**
     * Creates a web browser which is able to load pages waiting until
     * the page is completely loaded and solve xpaths returning
     * the corresponding nodes.
     *
     * @param xulrunnerPath directory of an uncompressed XULRunner package,
     *        or {@code null} to let SWT pick an installed XULRunner
     * @throws IllegalStateException if the browser widget cannot be created
     *         or the calling thread is interrupted while waiting for it
     */
    public SimpleBrowserWithXPath(final String xulrunnerPath) {

        if (xulrunnerPath != null) {
            System.setProperty(XULRUNNER_PATH_PROPERTY, xulrunnerPath);
        }

        // Use a latch to wait for the browser initialization.
        final CountDownLatch initLatch = new CountDownLatch(1);

        // The browser widget needs a window manager to work. We are using
        // SWT for the graphical interface, so browser methods have to be
        // executed in the SWT event thread.
        new Thread("SWT-Event-Thread") {
            @Override
            public void run() {
                display = new Display();
                Shell shell = new Shell(display);
                shell.setSize(800, 600);
                shell.open();

                try {
                    browser = new Browser(shell, SWT.MOZILLA);
                } catch (SWTError e) {
                    System.out.println("Could not instantiate Browser: " + e.getMessage());
                    initError = e;
                    display.dispose();
                    // Release the constructor, otherwise it would wait forever.
                    initLatch.countDown();
                    return;
                }

                // Adapt browser size to shell size
                browser.setBounds(shell.getClientArea());

                // Listens for page loading status.
                browser.addProgressListener(new ProgressListener() {
                    public void changed(ProgressEvent event) {
                    }

                    public void completed(ProgressEvent event) {
                        // A load may complete before 'go' was ever called,
                        // in which case there is no latch to release yet.
                        CountDownLatch current = latch;
                        if (current != null) {
                            current.countDown();
                        }
                    }
                });

                // Release the initialization latch, which has value 1,
                // so after this call its value will be 0.
                initLatch.countDown();

                while (!shell.isDisposed()) {
                    if (!display.readAndDispatch()) {
                        display.sleep();
                    }
                }
                System.exit(0);
            }
        }.start();

        try {
            // Waits until the initialization latch is released.
            initLatch.await();
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of clearing it.
            Thread.currentThread().interrupt();
            throw new IllegalStateException(
                    "Interrupted while waiting for the browser to initialize", e);
        }

        if (initError != null) {
            throw new IllegalStateException("Could not instantiate Browser", initError);
        }

        // Creates the XPath evaluator XPCOM component
        Mozilla moz = Mozilla.getInstance();
        nsIComponentManager componentManager = moz.getComponentManager();
        xpathEval = (nsIDOMXPathEvaluator) componentManager.createInstanceByContractID(
                NS_IDOMXPATHEVALUATOR_CONTRACTID, null,
                nsIDOMXPathEvaluator.NS_IDOMXPATHEVALUATOR_IID);
    }

    /**
     * Loads an URL into the browser and waits until the page is
     * totally loaded.
     *
     * @param url address of the page to load
     * @throws SimpleBrowserException if the page is not loaded within the
     *         default timeout
     */
    public void go(final String url) throws SimpleBrowserException {

        // Creates a latch with count 1
        latch = new CountDownLatch(1);

        // Uses the SWT event thread to execute the method to
        // load an URL in the browser.
        display.syncExec(new Runnable() {
            public void run() {
                browser.setUrl(url);
            }
        });

        // Waits for the page loading to finish, or for the timeout
        // in case the loading doesn't finish in a reasonable time.
        boolean timeout = waitLoad(defaultTimeout);
        if (timeout) {
            throw new SimpleBrowserException("Timeout waiting page loading.");
        }
    }

    /**
     * Fetches the document currently loaded in the browser. The Mozilla
     * document is obtained inside the SWT event thread and then wrapped
     * by {@code HTMLDocumentImpl.getDOMInstance}.
     *
     * @return a W3C HTML Document implementation corresponding to
     *      the Mozilla DOM HTML document currently loaded in the browser.
     */
    public HTMLDocument getW3CDocument() {

        class DocumentGetter implements Runnable {
            private nsIDOMHTMLDocument htmldoc;

            public void run() {
                nsIWebBrowser webBrowser = (nsIWebBrowser) browser.getWebBrowser();
                if (webBrowser == null) {
                    // Fail with a clear message instead of dereferencing null below.
                    throw new IllegalStateException(
                            "Could not get the nsIWebBrowser from the Browser widget");
                }

                nsIDOMWindow dw = webBrowser.getContentDOMWindow();
                nsIDOMDocument nsDoc = dw.getDocument();
                htmldoc = (nsIDOMHTMLDocument) nsDoc.queryInterface(
                        nsIDOMHTMLDocument.NS_IDOMHTMLDOCUMENT_IID);
            }

            public nsIDOMHTMLDocument getHtmldoc() {
                return htmldoc;
            }
        }

        DocumentGetter dg = new DocumentGetter();
        display.syncExec(dg);

        return HTMLDocumentImpl.getDOMInstance(dg.getHtmldoc());
    }

    /**
     * Evaluates an XPath expression using the loaded document as context.
     *
     * @param xpath XPath expression to evaluate
     * @return      a list with the nodes corresponding to a given xpath.
     */
    public List<Node> xpathNodes(String xpath) {
        return xPathNodes(xpath,
                ((HTMLDocumentImpl) getW3CDocument()).getInstance());
    }

    /**
     * Evaluates an XPath expression using the loaded document as context and
     * returns the matches as instances of {@code nodeClass}.
     *
     * @param <T> node type expected for every match
     * @param xpath XPath expression to evaluate
     * @param nodeClass class token of the expected node type; must not be null
     * @return      a list of <code>nodeClass</code> nodes corresponding
     *      to a given xpath.
     * @throws ClassCastException if a matched node is not a {@code nodeClass}
     */
    public <T extends Node> List<T> xpathNodes(String xpath, Class<T> nodeClass) {
        List<Node> untyped = xpathNodes(xpath);
        List<T> typed = new ArrayList<T>(untyped.size());
        for (Node node : untyped) {
            // Checked cast: a mismatch fails here rather than later at the
            // caller's use site.
            typed.add(nodeClass.cast(node));
        }
        return typed;
    }

    /**
     * Waits on the latch installed by {@code go} until the progress
     * listener reports completion. When the wait times out the load is
     * stopped and the latch is awaited once more so the stop can settle.
     *
     * @param millis maximum time to wait, in milliseconds
     * @return {@code true} if the first wait timed out
     */
    private boolean waitLoad(long millis) {
        try {
            boolean timeout = !latch.await(millis, TimeUnit.MILLISECONDS);
            if (timeout) {
                // The timeout expired, so stop the page loading.
                display.syncExec(new Runnable() {
                    public void run() {
                        browser.stop();
                    }
                });
                // Waits until the loading is stopped
                latch.await(millis, TimeUnit.MILLISECONDS);
            }
            return timeout;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IllegalStateException("Interrupted while waiting for page loading", e);
        }
    }

    /**
     * Evaluates an XPath expression relative to a Mozilla DOM node.
     *
     * @param xpath XPath expression to evaluate
     * @param context node the expression is evaluated against
     * @return the matched nodes as W3C nodes, in the iterator's order
     */
    private List<Node> xPathNodes(String xpath, nsIDOMNode context) {

        // Obtain the Mozilla DOM HTML document
        HTMLDocumentImpl documentImpl = (HTMLDocumentImpl) getW3CDocument();
        nsIDOMHTMLDocument document = documentImpl.getInstance();

        // Creates a name space resolver for the document
        nsIDOMXPathNSResolver res = xpathEval.createNSResolver(document);

        // Evaluates given XPath in a given context, using the resolver created
        // for the current document as an ordered iterator
        nsISupports obj = xpathEval.evaluate(xpath, context, res,
                nsIDOMXPathResult.ORDERED_NODE_ITERATOR_TYPE, null);

        // Obtain the interface corresponding to the XPath XPCOM results object
        nsIDOMXPathResult result = (nsIDOMXPathResult) obj.queryInterface(
                nsIDOMXPathResult.NS_IDOMXPATHRESULT_IID);

        // Extract result nodes for the XPath and add them to a standard List.
        return getNodes(result);
    }

    /**
     * Drains an XPath result iterator into a list of W3C nodes.
     *
     * @param result iterator-type XPath result
     * @return the nodes produced by the iterator, in iteration order
     */
    private List<Node> getNodes(nsIDOMXPathResult result) {
        List<Node> nodes = new ArrayList<Node>();

        nsIDOMNode node;
        while ((node = result.iterateNext()) != null) {
            // Use the functionality provided by the mozdom4java
            // (in our case, patched) library to obtain the corresponding
            // W3C implementation of a node.
            nodes.add(NodeFactory.getNodeInstance(node));
        }
        return nodes;
    }

    /**
     * Demo entry point: loads a page, prints every element and every anchor.
     *
     * @param args optional; {@code args[0]} is the XULRunner directory
     */
    public static void main(String[] args) {
        String xulrunnerPath = null;
        if (args.length > 0) {
            xulrunnerPath = args[0];
        }

        // Instantiate our simple web browser
        SimpleBrowserWithXPath simpleBrowser = new SimpleBrowserWithXPath(xulrunnerPath);

        try {
            // Load a page in the browser
            simpleBrowser.go("http://www.google.com");

            // Obtain a list of nodes, without a concrete class,
            // because the XPath may return nodes of different
            // types, so we work with them in a generic way.
            List<Node> nodes = simpleBrowser.xpathNodes("//*");
            for (Node node : nodes) {
                System.out.println("Node Type: " + node.getNodeName()
                        + " -- Content: " + node.getTextContent());
            }

            // Obtain a list of HTMLAnchorElements, because
            // we can be sure about the result of our XPath,
            // if it has any result, will be only of
            // HTMLAnchorElement type.
            for (HTMLAnchorElement a : simpleBrowser.xpathNodes(
                    "//a", HTMLAnchorElement.class)) {
                System.out.println("Anchor: " + a.getHref());
            }

        } catch (SimpleBrowserException e) {
            System.err.println("Problems calling go method.");
            e.printStackTrace();
        }

        // The SWT event thread is still running its loop, so the JVM is
        // terminated explicitly.
        Runtime.getRuntime().halt(0);
    }

}