XULRunner with Java: JavaXPCOM Tutorial 4

最新推荐文章于 2024-07-30 20:50:32 发布

fancyerII

最新推荐文章于 2024-07-30 20:50:32 发布

阅读量2.2k

点赞数

分类专栏：搜索文章标签： java browser mozilla initialization string swt

搜索专栏收录该内容

13 篇文章 0 订阅

订阅专栏

7. 用JavaXPCOM来自动浏览网页
   在这一节里我们将解决自动浏览网页中的一些问题，尝试把一些通用的任务抽象成人类可读的方法，因此你可以轻易地阅读

代码并知道它的功能。我们将构建一个Web Browser来加载网页，点击按钮或者超链接，使用XPath来抽取一些有用的信息。在每一个

小节里面，我们都将在我们的浏览器里增加新的功能，因此在最后，我们将有一个Web Browser，它能够实现网页的自动
浏览。

7.1 使用我们的浏览器来加载网页
    在例子 SimpleBrowser里，我们使用方法 public boolean setUrl(String url) 来让浏览器加载一个url。这个方法的问题是它

可以让浏览器开始加载一个页面，但是不会等待浏览器加载页面完成。我们写了一个方法叫做go来实现这个功能，因此后面我们将使

用这种方法来安全的加载一个页面，阻塞住执行流程直到页面加载成功或者超时。
    注：实现的方法是在setUrl的时候定义一个CountDownLatch，然后监听browser.addProgressListener，等页面完成后把这个

CountDownLatch countDown一下。调用setUrl后使用CountDownLatch的await方法等待加载完成或者超时。注意setUrl必须放到

SWT事件线程里执行（通过display.syncExec），而不是在调用者所在的线程里直接调用：
               // Hand the call over to the SWT event thread; syncExec blocks the
               // calling thread until run() has finished there.
               display.syncExec(new Runnable() {
            public void run() {
                // Only asks the browser to start loading the URL.
                browser.setUrl(url);
            }
        });
    另外，我们可能还想获得加载后的页面内容，所以译者增加了一个变量content。增加的代码如下：
                browser.addProgressListener(new ProgressListener() {
                    public void changed(ProgressEvent event) {
                        // Intermediate progress notifications are not needed here.
                    }

                    public void completed(ProgressEvent event) {
                        // Capture the page text BEFORE releasing the latch. The
                        // thread blocked in await() resumes as soon as the count
                        // reaches 0, so assigning 'content' afterwards would let
                        // it observe a stale (or null) value.
                        content = browser.getText(); // added by LiLi
                        // When a page is loaded, decrement the latch,
                        // which count will be 0 after this call.
                        latch.countDown();
                    }
                });

package es.ladyr.javaxpcom.browser;

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

import org.eclipse.swt.SWT;
import org.eclipse.swt.browser.ProgressEvent;
import org.eclipse.swt.browser.ProgressListener;
import org.eclipse.swt.widgets.Display;
import org.eclipse.swt.widgets.Shell;
import org.eclipse.swt.browser.Browser;
import org.eclipse.swt.SWTError;

/**
 * Minimal SWT/Mozilla browser whose {@link #go(String)} method blocks the
 * caller until the requested page has finished loading or a timeout expires.
 *
 * <p>The SWT display, shell and browser widget live on a dedicated
 * "SWT-Event-Thread"; every call that touches the widget is forwarded to
 * that thread with {@code display.syncExec}.
 */
public class SimpleBrowserWithGo {

    // System property read by SWT to locate a non-registered XULRunner.
    private static final String XULRUNNER_PATH_PROPERTY =
            "org.eclipse.swt.browser.XULRunnerPath";

    Browser browser;

    // We will need SWT display to execute methods
    // into the SWT event thread.
    private Display display;

    // Latch used to manage page loading.
    // Uses a count of 1, so when the browser starts loading
    // a page, we create a new latch, which will be
    // decremented when the page is loaded.
    // Written by the caller of go() and read on the SWT thread.
    private volatile CountDownLatch latch;

    // Default timeout to 60 seconds
    private final long defaultTimeout = 60000;

    // Set on the SWT thread when the Browser widget could not be created.
    // Published to the constructor thread through the initialization latch.
    private SWTError initFailure;

    /**
     * Creates a web browser which is able to load pages waiting until
     * the page is completely loaded.
     *
     * @param xulrunnerPath directory of an unpacked XULRunner distribution,
     *        or {@code null} to use the XULRunner registered on the system
     * @throws IllegalStateException if the browser widget cannot be created
     *         or the calling thread is interrupted during initialization
     */
    public SimpleBrowserWithGo(final String xulrunnerPath) {
        // Tell SWT where XULRunner lives. This must happen before the first
        // Mozilla-based Browser widget is instantiated.
        if (xulrunnerPath != null && xulrunnerPath.length() > 0) {
            System.setProperty(XULRUNNER_PATH_PROPERTY, xulrunnerPath);
        }

        // Use a latch to wait for the browser initialization.
        final CountDownLatch initLatch = new CountDownLatch(1);

        // The browser needs a window manager to work. We are using SWT
        // for the graphical interface, so we need to execute browser
        // methods into the SWT event thread. If we were to use another
        // thread, those methods could fail and throw an exception,
        // breaking the execution flow and crashing our application.
        new Thread("SWT-Event-Thread") {
            @Override
            public void run() {
                display = new Display();
                Shell shell = new Shell(display);
                shell.setSize(800, 600);
                shell.open();

                try {
                    browser = new Browser(shell, SWT.MOZILLA);
                } catch (SWTError e) {
                    System.out.println("Could not instantiate Browser: " + e.getMessage());
                    initFailure = e;
                    display.dispose();
                    // Release the constructor, otherwise it would wait forever.
                    initLatch.countDown();
                    return;
                }

                // Adapt browser size to shell size
                browser.setBounds(shell.getClientArea());

                // Listens for page loading status.
                browser.addProgressListener(new ProgressListener() {
                    public void changed(ProgressEvent event) {
                    }

                    public void completed(ProgressEvent event) {
                        // A 'completed' event may arrive before go() has ever
                        // been called, in which case there is no latch yet.
                        CountDownLatch pending = latch;
                        if (pending != null) {
                            // When a page is loaded, decrement the latch,
                            // which count will be 0 after this call.
                            pending.countDown();
                        }
                    }
                });

                // Release the initialization latch, which has value 1,
                // so after this call its value will be 0.
                initLatch.countDown();

                while (!shell.isDisposed()) {
                    if (!display.readAndDispatch()) {
                        display.sleep();
                    }
                }
                System.exit(0);
            }
        }.start();

        try {
            // Waits until the initialization latch is released.
            initLatch.await();
        } catch (InterruptedException e) {
            // Restore the interrupt flag for the caller; the browser may not
            // be initialized yet, so the instance must not be used.
            Thread.currentThread().interrupt();
            throw new IllegalStateException(
                    "Interrupted while waiting for browser initialization", e);
        }

        if (initFailure != null) {
            throw new IllegalStateException(
                    "Could not instantiate Browser: " + initFailure.getMessage(), initFailure);
        }
    }

    /**
     * Loads an URL into the browser and waits until the page is
     * totally loaded.
     *
     * @param url address of the page to load
     * @throws SimpleBrowserException if the page is not loaded within the
     *         default timeout
     */
    public void go(final String url) throws SimpleBrowserException {
        // Creates a latch with count 1
        latch = new CountDownLatch(1);

        // Uses the SWT event thread to execute the method to
        // load an URL in the browser.
        display.syncExec(new Runnable() {
            public void run() {
                browser.setUrl(url);
            }
        });

        // Waits for the finish of the page loading, or for a given
        // timeout in case that the loading doesn't finish in a
        // reasonable time.
        boolean timeout = waitLoad(defaultTimeout);
        if (timeout) {
            throw new SimpleBrowserException("Timeout waiting page loading.");
        }
    }

    /**
     * Waits on the latch created by {@link #go(String)} until the progress
     * listener receives its 'completed' event or the timeout expires. On
     * timeout the load is stopped and the method waits once more for the
     * stop to be acknowledged.
     *
     * @param millis maximum time to wait, in milliseconds
     * @return {@code true} if the timeout expired before the page was loaded
     */
    private boolean waitLoad(long millis) {
        try {
            boolean timeout = !latch.await(millis, TimeUnit.MILLISECONDS);
            if (timeout) {
                // If the timeout expired, then we will stop
                // page loading.
                display.syncExec(new Runnable() {
                    public void run() {
                        browser.stop();
                    }
                });
                // Waits until the loading is stopped
                latch.await(millis, TimeUnit.MILLISECONDS);
            }
            return timeout;
        } catch (InterruptedException e) {
            // Keep the interrupt visible to callers before failing.
            Thread.currentThread().interrupt();
            throw new Error(e);
        }
    }

    /**
     * Demo entry point: loads three pages in sequence.
     *
     * @param args optional; {@code args[0]} is the XULRunner directory
     */
    public static void main(String[] args) {
        String xulrunnerPath = null;
        if (args.length > 0) {
            xulrunnerPath = args[0];
        }

        // Instantiate our simple web browser
        SimpleBrowserWithGo simpleBrowser = new SimpleBrowserWithGo(xulrunnerPath);

        try {
            // Use the new functionality to load some URLs
            // with our browser.
            simpleBrowser.go("http://www.google.com");
            Thread.sleep(3000);
            simpleBrowser.go("http://www.urjc.es");
            Thread.sleep(3000);
            simpleBrowser.go("http://www.mozilla.org");
            Thread.sleep(3000);
        } catch (SimpleBrowserException e) {
            System.err.println("Problems calling go method.");
            e.printStackTrace();
        } catch (InterruptedException e) {
            System.err.println("Problems calling sleep.");
            e.printStackTrace();
            Thread.currentThread().interrupt();
        }

        // The SWT event thread is still running its loop, so force the
        // JVM down once the demo is finished.
        Runtime.getRuntime().halt(0);
    }
}

7.2 解析XPath来获得W3C Node
    一旦我们能够在浏览器里加载一个HTML页面，我们可能想访问DOM节点来抽取信息。前面我们花了一节来把Mozilla Node转换成

W3C node。现在我们使用那个方法来用标准的方法操作W3C Node。我们实现了一些方法来创建XPath Evaluator和XPath resolver来

抽取节点。当XPath evaluator返回一个结果时，我们把每个返回的DOM node转换成相应的W3C DOM element，使用的方法是 static

Node getNodeInstance( nsIDOMNode node ) 。因此，使用我们的browser可以直接调用下面的方法：

... import org.w3c.dom.Node; ... /** * * @param xpath * @return a list with the nodes corresponding to a given xpath. * @throws SimpleBrowserException */ public List<Node> xpathNodes(String xpath) { ... /** * * @param <T> * @param xpath * @param nodeClass * @return a list of <code>nodeClass</code> nodes corresponding * to a given xpath. * @throws SimpleBrowserException */ public <T extends Node> List<T> xpathNodes(String xpath, Class<T> nodeClass) { ...

下面是完整的例子：

译注：核心代码就是下面这个方法。它有两个参数，xpath和nsIDOMNode,返回的是满足XPath的Node的list。
Node是W3C Node 。其实就是调用xpcom的接口来做xpath解析，然后把nsIDOMNode的List转成W3C Node的list。
不过我感觉其实没有必要，nsIDOMNode的属性更多，而且W3C Node没办法转回去成为nsIDOMNode。在使用XPath的时候，我们可能先

用绝对XPath找到某个table，然后根据相对XPath找tr，td等。第二次调用xPathNodes时的参数 nsIDOMNode context就是第一次返回

的结果里的nsIDOMNode。

private List<Node> xPathNodes(String xpath, nsIDOMNode context) { // Obtain the Mozilla DOM HTML document HTMLDocumentImpl documentImpl = (HTMLDocumentImpl) getW3CDocument(); nsIDOMHTMLDocument document = documentImpl.getInstance(); // Creates a name space resolver for the document nsIDOMXPathNSResolver res = xpathEval.createNSResolver(document); List<Node> resultNodes = null; // Evaluates given XPath in a given context, using the resolver created // for the current document as an ordered iterator nsISupports obj = xpathEval.evaluate(xpath, context, res, nsIDOMXPathResult.ORDERED_NODE_ITERATOR_TYPE, null); // Obtain the interface corresponding to the XPath XPCOM results object nsIDOMXPathResult result = (nsIDOMXPathResult) obj .queryInterface(nsIDOMXPathResult.NS_IDOMXPATHRESULT_IID); try { // Extract result nodes for the XPath and add them // to a standard List. resultNodes = getNodes(result); } catch (org.mozilla.xpcom.XPCOMException e) { throw e; } return resultNodes; } package es.ladyr.javaxpcom.browser; import java.util.ArrayList; import java.util.List; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import org.eclipse.swt.SWT; import org.eclipse.swt.SWTError; import org.eclipse.swt.browser.Browser; import org.eclipse.swt.browser.ProgressEvent; import org.eclipse.swt.browser.ProgressListener; import org.eclipse.swt.widgets.Display; import org.eclipse.swt.widgets.Shell; import org.mozilla.dom.NodeFactory; import org.mozilla.dom.html.HTMLDocumentImpl; import org.mozilla.interfaces.nsIComponentManager; import org.mozilla.interfaces.nsIDOMDocument; import org.mozilla.interfaces.nsIDOMHTMLDocument; import org.mozilla.interfaces.nsIDOMNode; import org.mozilla.interfaces.nsIDOMWindow; import org.mozilla.interfaces.nsIDOMXPathEvaluator; import org.mozilla.interfaces.nsIDOMXPathNSResolver; import org.mozilla.interfaces.nsIDOMXPathResult; import org.mozilla.interfaces.nsISupports; import org.mozilla.interfaces.nsIWebBrowser; import 
org.mozilla.xpcom.Mozilla; import org.w3c.dom.Node; import org.w3c.dom.html.HTMLAnchorElement; import org.w3c.dom.html.HTMLDocument; public class SimpleBrowserWithXPath { private final static String NS_IDOMXPATHEVALUATOR_CONTRACTID = "@mozilla.org/dom/xpath-evaluator;1"; private Browser browser; // We will need SWT display to execute methods // into the SWT event thread. private Display display; // Latch used to manage page loading // Uses a count of 1, so when the browser starts loading // a page, we create a new latch, which will be // decremented when the page is loaded. private CountDownLatch latch; // Default timeout to 60 seconds private long defaultTimeout = 60000; // XPath evaluator private nsIDOMXPathEvaluator xpathEval; /** * Creates a web browser which is able to load pages waiting until * the page is completely loaded and solve xpaths returning * the corresponding nodes. * */ public SimpleBrowserWithXPath (final String xulrunnerPath) { // Use a latch to wait for the browser initialization. final CountDownLatch initLatch = new CountDownLatch(1); // MozillaBrowser needs a window manager to work. We are using SWT // for the graphical interface, so we need to execute MozillaBrowser // methods into the SWT event thread. If we were use another thread, // that methods could not work properly and throw an exception, // breaking the execution flow and crashing our application. new Thread("SWT-Event-Thread") { @Override public void run() { display = new Display(); Shell shell = new Shell(display); shell.setSize(800, 600); shell.open(); // If you have XULRunner installed, you can call the constructor without // the last parameter: // // final MozillaBrowser browser = new MozillaBrowser(shell,SWT.BORDER); // // That last parameter is the path for XULRunner files // (where you have uncompressed downloaded XULRunner package). 
try { browser = new Browser(shell, SWT.MOZILLA); } catch (SWTError e) { System.out.println("Could not instantiate Browser: " + e.getMessage ()); return; } // Adapt browser size to shell size browser.setBounds(shell.getClientArea()); // Listens for page loading status. browser.addProgressListener(new ProgressListener() { public void changed(ProgressEvent event) { } public void completed(ProgressEvent event) { // When a page is loaded, decrement the latch, // which count will be 0 after this call. latch.countDown(); } }); // Realease the initialization latch, which has value 1, // so after this call its value will be 0. initLatch.countDown(); while (!shell.isDisposed()) { if (!display.readAndDispatch()) { display.sleep(); } } System.exit(0); } }.start(); try { // Waits until the initialization latch is released. initLatch.await(); } catch (InterruptedException e) { Thread.interrupted(); } // Creates the XPath evaluator XPCOM component Mozilla moz = Mozilla.getInstance(); nsIComponentManager componentManager = moz.getComponentManager(); xpathEval = (nsIDOMXPathEvaluator) componentManager.createInstanceByContractID( NS_IDOMXPATHEVALUATOR_CONTRACTID, null, nsIDOMXPathEvaluator.NS_IDOMXPATHEVALUATOR_IID); } /** * Loads an URL into the browser and waits until the page is * totally loaded. * @param url * @throws SimpleBrowserException */ public void go(final String url) throws SimpleBrowserException { // Creates a latch with count 1 latch = new CountDownLatch(1); // Uses the SWT event thread to execute the method to // load an URL in the browser. display.syncExec(new Runnable() { public void run() { browser.setUrl(url); } }); // Waits for the finish of the page loading, or for a given // timeout in case that the loading doesn't finish in a // reasonable time. 
boolean timeout = waitLoad(defaultTimeout); if (timeout) { throw new SimpleBrowserException("Timeout waiting page loading."); } } /** * * @return an W3C HTML Document implementation corresponding to * the Mozilla DOM HTML document currently loaded in the browser. * @throws SimpleBrowserException */ public HTMLDocument getW3CDocument() { // System.out.println("El browser es " + browser.toString()); class DocumentGetter implements Runnable { private nsIDOMHTMLDocument htmldoc; public void run(){ nsIWebBrowser webBrowser = (nsIWebBrowser)browser.getWebBrowser(); if (webBrowser == null) { System.out.println("Could not get the nsIWebBrowser from the Browser widget"); } nsIDOMWindow dw = webBrowser.getContentDOMWindow(); nsIDOMDocument nsDoc = dw.getDocument(); htmldoc = (nsIDOMHTMLDocument) nsDoc .queryInterface (nsIDOMHTMLDocument.NS_IDOMHTMLDOCUMENT_IID); } public nsIDOMHTMLDocument getHtmldoc() { // TODO Auto-generated method stub return htmldoc; }} DocumentGetter dg = new DocumentGetter(); display.syncExec(dg); return HTMLDocumentImpl.getDOMInstance(dg.getHtmldoc()); } /** * * @param xpath * @return a list with the nodes corresponding to a given xpath. * @throws SimpleBrowserException */ public List<Node> xpathNodes(String xpath) { return xPathNodes(xpath, ((HTMLDocumentImpl) getW3CDocument()).getInstance()); } /** * * @param <T> * @param xpath * @param nodeClass * @return a list of <code>nodeClass</code> nodes corresponding * to a given xpath. * @throws SimpleBrowserException */ public <T extends Node> List<T> xpathNodes(String xpath, Class<T> nodeClass) { return (List<T>)xPathNodes(xpath, ((HTMLDocumentImpl) getW3CDocument()).getInstance()); } private boolean waitLoad(long millis) { try { // Uses the latch, created by 'go' method to wait for // the finish of the page loading (it will occurs when // our 'progressListener' receives a event for its method // 'completed'), or for a given timeout in case that the // loading doesn't finish in a reasonable time. 
boolean timeout; timeout = !latch.await(millis,TimeUnit.MILLISECONDS); if (timeout) { // If the timeout expired, then we will stop // page loading. display.syncExec(new Runnable() { public void run() { browser.stop(); } }); // Waits for the loading is stopped latch.await(millis,TimeUnit.MILLISECONDS); } return timeout; } catch (InterruptedException e) { throw new Error(e); } } private List<Node> xPathNodes(String xpath, nsIDOMNode context) { // Obtain the Mozilla DOM HTML document HTMLDocumentImpl documentImpl = (HTMLDocumentImpl) getW3CDocument(); nsIDOMHTMLDocument document = documentImpl.getInstance(); // Creates a name space resolver for the document nsIDOMXPathNSResolver res = xpathEval.createNSResolver(document); List<Node> resultNodes = null; // Evaluates given XPath in a given context, using the resolver created // for the current document as an ordered iterator nsISupports obj = xpathEval.evaluate(xpath, context, res, nsIDOMXPathResult.ORDERED_NODE_ITERATOR_TYPE, null); // Obtain the interface corresponding to the XPath XPCOM results object nsIDOMXPathResult result = (nsIDOMXPathResult) obj.queryInterface( nsIDOMXPathResult.NS_IDOMXPATHRESULT_IID); try { // Extract result nodes for the XPath and add them // to a standard List. resultNodes = getNodes(result); } catch(org.mozilla.xpcom.XPCOMException e){ throw e; } return resultNodes; } private <T> List<T> getNodes(nsIDOMXPathResult result) { List<T> nodes = new ArrayList<T>(); nsIDOMNode node; while((node = result.iterateNext()) != null){ // Use the functionality provided by the mozdom4java // (in our case, patched) library to obtain the corresponding // W3C implementation of a node. 
nodes.add((T)NodeFactory.getNodeInstance(node)); } return nodes; } public static void main(String[] args) { String xulrunnerPath = null; if ( args.length > 0 ) { xulrunnerPath = args[0]; } // Instantiate our simple web browser SimpleBrowserWithXPath simpleBrowser = new SimpleBrowserWithXPath(xulrunnerPath); try{ // Load a page in the browser simpleBrowser.go("http://www.google.com"); // Obtain a list of nodes, without a concrete class, // because the XPath may return nodes of different // types, so we work with them in a generic way. List<Node> nodes = simpleBrowser.xpathNodes("//*"); for (Node node: nodes) { System.out.println("Node Type: " + node.getNodeName() + " -- Content: " + node.getTextContent()); } // Obtain a list of HTMLAnchorElements, because // we can be sure about the result of our XPath, // if it has any result, will be only of // HTMLAnchorElement type. for (HTMLAnchorElement a: simpleBrowser.xpathNodes( "//a", HTMLAnchorElement.class)) { System.out.println("Anchor: " + a.getHref()); } } catch (SimpleBrowserException e) { System.err.println("Problems calling go method."); e.printStackTrace(); } Runtime.getRuntime().halt(0); } }

fancyerII

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
XULRunner with Java: JavaXPCOM Tutorial 4

7. 用JavaXPCOM来自动浏览网页在这一节里我们将解决一些自动浏览网页中的一些问题，尝试把一下通用的任务抽象成人类可读的方法，因此你可以轻易的阅读代码并知道它的功能。我们将构建一个Web Browser来加载网页，点击按钮或者超链接，使用XPath来抽取一下有用的信息。在每一个小节里面，我们都将在我们的浏览器里增加新的功能，因此在最后，我们将有一个Web Brows
复制链接

扫一扫

专栏目录