今天解析了一下HTML页面,分享给大家,顺便做个备忘。
需求:
1.能抓取HTML页面中文本输入框(input type="text")和复选框(input type="checkbox")的值
2.对于被选中的复选框,能同时抓取它的value值和对应的显示文本
HTML页面样式为:
HTML脚本为:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>测试</title>
</head>
<body>
<!--
  Sample form used as parser input. The parsing code addresses cells by the
  position of each <tr> in the first table and by the position of each <input>
  inside its row, so rows and inputs must keep their current order.
-->
<form name="test" id="test" method="post" action="test.do">
<div>
<table width="865" border="0" cellspacing="0" cellpadding="0">
  <!-- Row 0: spacer -->
  <tr>
    <td height="30" colspan="4" align="center"></td>
  </tr>
  <!-- Row 1: region; the <font> element is kept because the parser selects it -->
  <tr>
    <td height="34" align="right">区域:</td>
    <td colspan="3"><!-- Part that must be copied when calling -->
      <span id="f"></span><span id="floading"></span><span id="ferr"></span>地址:<font color="red">广东省-中山市-古镇</font></td>
  </tr>
  <!-- Row 2: shop name (input 0) and company (input 1) -->
  <tr>
    <td width="102" height="26" align="right">商城:</td>
    <td width="178" align="left">
      <input type="text" name="tmallname" value="灯饰旗舰店" readonly="readonly" /><span id="checkTmallname" style="color:#f00;"></span>
    </td>
    <td width="100" align="right">公司:</td>
    <td width="485" align="left">
      <input type="text" name="companyname" value="中山市家居用品有限公司" readonly="readonly" />
    </td>
  </tr>
  <!-- Row 3: contact name (input 0) and telephone (input 1) -->
  <tr>
    <td width="102" height="26" align="right">姓名:</td>
    <td width="178" align="left">
      <input type="text" name="truename" value="姜先生" readonly="readonly" />
    </td>
    <td width="100" align="right">电话:</td>
    <td width="485" align="left">
      <input type="text" name="telphone" value="4000010110" readonly="readonly" />
    </td>
  </tr>
  <!-- Row 4: mobile number (input 0) -->
  <tr>
    <td width="102" height="26" align="right">手机:</td>
    <td width="178" align="left">
      <input type="text" name="mobile" value="13012345678" readonly="readonly" />
    </td>
    <td width="100" align="right"></td>
    <td width="485" align="left">
    </td>
  </tr>
  <!-- Row 5: QQ and WangWang; each input now has its own unique id -->
  <tr>
    <td width="102" height="40" align="right">QQ:</td>
    <td width="178" align="left">
      <input type="text" name="qq" id="qq" value="" readonly="readonly" />
    </td>
    <td width="100" align="right">旺旺:</td>
    <td width="485" align="left"><input type="text" name="wangwang" readonly="readonly" id="wangwang" value="灯饰旗舰店" /></td>
  </tr>
  <!-- Row 6: e-mail and postal address -->
  <tr>
    <td width="102" height="40" align="right">邮箱:</td>
    <td width="178" align="left">
      <input type="text" name="email" id="email" value="" readonly="readonly" />
    </td>
    <td width="100" align="right">地址:</td>
    <td width="485" align="left"><textarea name="address" id="address" cols="50" rows="2">广东省中山市古镇**楼</textarea></td>
  </tr>
  <!-- Row 7: repair categories; <input> is a void element, so the label text follows the tag -->
  <tr>
    <td width="102" height="40" align="right">维修:</td>
    <td align="left" colspan="2">
      <input name="class" type="checkbox" readonly="readonly" value="1" checked="checked" />灯具
      <input name="class" type="checkbox" readonly="readonly" value="2" />卫浴
    </td>
  </tr>
</table>
</div>
</form>
<br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br />
</body>
</html>
解析代码为(使用jsoup库):
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Small demo that loads {@code testpage.html} from the classpath root, parses it with jsoup and
 * prints a one-line summary of selected form fields plus the checked check boxes.
 *
 * <p>Fields are located purely by position: the index of the {@code <tr>} inside the first
 * {@code <table>} and the index of the {@code <input>} inside that row.
 */
public class ParseHTML {

    /** Message printed when the page does not contain the expected table rows. */
    private static final String NO_RESULT = "没有结果";

    /** Row holding the address text inside a {@code <font>} element. */
    private static final int ROW_ADDRESS = 1;
    /** Row holding the shop name (input 0) and the company name (input 1). */
    private static final int ROW_SHOP_COMPANY = 2;
    /** Row holding the contact name (input 0) and the telephone number (input 1). */
    private static final int ROW_NAME_PHONE = 3;
    /** Row holding the mobile number (input 0). */
    private static final int ROW_MOBILE = 4;
    /** Row holding the check boxes; also the highest row index that is read. */
    private static final int ROW_REPAIR = 7;

    /**
     * Reads {@code testpage.html} from the classpath root and prints the parsed summary.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        java.net.URL classpathRoot = ParseHTML.class.getResource("/");
        if (classpathRoot == null) {
            // getResource returns null when the root cannot be resolved; avoid the NPE.
            System.err.println("Cannot resolve the classpath root; testpage.html was not loaded.");
            return;
        }
        String fileName = classpathRoot.getPath() + "testpage.html";
        parseHtml(readFileByLines(fileName));
    }

    /**
     * Reads a UTF-8 text file line by line and returns its content.
     *
     * <p>Every line is terminated with {@code '\n'} in the result so that markup or text on
     * adjacent lines is never glued together. I/O failures are reported on standard error and
     * whatever was read up to that point (possibly an empty string) is returned; no exception is
     * propagated for them.
     *
     * @param fileName path of the file to read
     * @return the file content, never {@code null}
     */
    public static String readFileByLines(String fileName) {
        StringBuilder content = new StringBuilder();
        // Both resources are declared separately so the stream is closed even if wrapping it fails.
        try (FileInputStream in = new FileInputStream(new File(fileName));
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
            String current;
            while ((current = reader.readLine()) != null) {
                content.append(current).append('\n');
            }
        } catch (IOException e) {
            // Best effort: keep the original contract of returning what could be read.
            e.printStackTrace();
        }
        return content.toString();
    }

    /**
     * Parses the page and prints address, company, shop, name, telephone, mobile and every checked
     * check box as {@code value=label,} on a single line.
     *
     * <p>Prints {@value #NO_RESULT} instead when the first table is missing or has fewer rows than
     * the layout requires.
     *
     * @param html the HTML source to parse
     */
    private static void parseHtml(String html) {
        Document doc = Jsoup.parse(html);
        Elements tables = doc.select("table");
        Elements rows = tables.isEmpty() ? tables : tables.get(0).select("tr");
        if (rows.size() <= ROW_REPAIR) {
            System.out.println(NO_RESULT);
            return;
        }

        StringBuilder summary = new StringBuilder();
        summary.append("地址:").append(firstText(rows.get(ROW_ADDRESS), "font"));
        summary.append("//公司:").append(inputValue(rows.get(ROW_SHOP_COMPANY), 1));
        summary.append("//商城:").append(inputValue(rows.get(ROW_SHOP_COMPANY), 0));
        summary.append("/姓名:").append(inputValue(rows.get(ROW_NAME_PHONE), 0));
        summary.append("//电话:").append(inputValue(rows.get(ROW_NAME_PHONE), 1));
        summary.append("//手机:").append(inputValue(rows.get(ROW_MOBILE), 0)).append("//");

        for (Element input : rows.get(ROW_REPAIR).select("input")) {
            // "checked" is a boolean attribute: its presence, not its value, marks the selection.
            if (input.hasAttr("checked")) {
                String value = input.attr("value");
                summary.append(value).append('=').append(getText(value)).append(',');
            }
        }
        System.out.println(summary);
    }

    /** Returns the text of the first element matching {@code selector} in {@code row}, or "". */
    private static String firstText(Element row, String selector) {
        Element match = row.select(selector).first();
        return match == null ? "" : match.text();
    }

    /** Returns the {@code value} attribute of the {@code index}-th input in {@code row}, or "". */
    private static String inputValue(Element row, int index) {
        Elements inputs = row.select("input");
        return index < inputs.size() ? inputs.get(index).attr("value") : "";
    }

    /**
     * Maps a check box value to its display label.
     *
     * @param value the {@code value} attribute of a check box
     * @return the label for {@code "1"} or {@code "2"}, or {@code null} for any other value
     */
    private static String getText(String value) {
        if ("1".equals(value)) {
            return "灯具";
        }
        if ("2".equals(value)) {
            return "卫浴";
        }
        return null;
    }
}
运行输出的结果为:
地址:广东省-中山市-古镇//公司:中山市家居用品有限公司//商城:灯饰旗舰店/姓名:姜先生//电话:4000010110//手机:13012345678//1=灯具,