html页面用什么表达式取值,使用HTMLUNIT从标签之间的HTML页面提取数据

以下是我采用的步骤(并非唯一的解决方案):首先,通过 parseHtml 方法并借助一个虚拟 URL 来解析 HTML 字符串

通过xpath获取第二张表

使用两层嵌套循环进行迭代(外层用 for 循环遍历表格的每一行,内层用迭代器遍历该行的单元格,这样分隔符只会出现在单元格之间)

ExtractTableData:

import java.net.URL;

import com.gargoylesoftware.htmlunit.StringWebResponse;

import com.gargoylesoftware.htmlunit.WebClient;

import com.gargoylesoftware.htmlunit.html.HTMLParser;

import com.gargoylesoftware.htmlunit.html.HtmlPage;

import com.gargoylesoftware.htmlunit.html.HtmlTable;

import com.gargoylesoftware.htmlunit.html.HtmlTableRow;

import com.gargoylesoftware.htmlunit.html.HtmlTableRow.CellIterator;

public class ExtractTableData {

public static void main(String[] args) throws Exception {

String html = "

\n" + "                       
\n"

+ "                            Home\n"

+ "                            |\n"

+ "                            Queues\n"

+ "                            |\n"

+ "                            Topics\n"

+ "                            |\n"

+ "                            Subscribers\n"

+ "                            |\n"

+ "                            Connections\n"

+ "                            |\n"

+ "                            Network\n"

+ "                            |\n"

+ "                             Scheduled\n"

+ "                            |\n" + "                           

+ "                               title=\"Send\">Send

\n" + "                       
\n"

+ "                       

\n"

+ "                           

+ "                               title=\"Get help and support using Apache ActiveMQ\">Support

\n"

+ "                       

\n" + "                   
\n" + "\n"

+ "                   

+ "                           

\n"

+ "                               

\n"

+ "                                   

\n" + "\n" + "\n"

+ "

Welcome!

\n" + "\n" + "

\n"

+ "Welcome to the Apache ActiveMQ Console of localhost (ID:TOOLCONTROLPJX526-524666-65544585445-2:3)\n"

+ "

\n" + "\n" + "

\n"

+ "You can find more information about Apache ActiveMQ on the Apache ActiveMQ Site\n"

+ "

\n" + "\n" + "

Broker

\n" + "\n" + "\n" + "

+ "       

Name\n" + "        localhost\n" + "    \n" + "    \n"

+ "       

Version\n" + "        5.13.3\n" + "    \n" + "    \n"

+ "       

ID\n" + "        ID:TOOLCONTROLPJX526-524666-65544585445-2:3\n"

+ "   

\n" + "    \n" + "        Uptime\n"

+ "       

17 days 13 hours\n" + "    \n" + "    \n"

+ "       

Store percent used\n" + "        19\n" + "    \n"

+ "   

\n" + "        Memory percent used\n" + "        0\n"

+ "   

\n" + "    \n" + "        Temp percent used\n" + "        0\n"

+ "   

\n" + "";

WebClient webClient = new WebClient();

HtmlPage page = HTMLParser.parseHtml(new StringWebResponse(html, new URL("http://dummy.url.for.parsing.com/")),

webClient.getCurrentWindow());

final HtmlTable table = (HtmlTable) page.getByXPath("//table").get(1);

for (final HtmlTableRow row : table.getRows()) {

CellIterator cellIterator = row.getCellIterator();

if (cellIterator.hasNext()) {

System.out.print(cellIterator.next().asText());

while (cellIterator.hasNext()) {

System.out.print(":" + cellIterator.next().asText());

}

}

System.out.println();

}

}

}

输出:

Name:localhost

Version:5.13.3

ID:ID:TOOLCONTROLPJX526-524666-65544585445-2:3

Uptime:17 days 13 hours

Store percent used:19

Memory percent used:0

Temp percent used:0

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值