Java通过HttpURLConnection访问页面，并使用jsoup解析HTML文件元素
依赖包
<!-- jsoup HTML parser: provides the Jsoup.parse / Document / Element / Elements API used by the parsing snippets in this file. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
GET带Cookie
/**
 * Sends an HTTP GET request and returns the response body as text.
 *
 * <p>The body is read only when the server answers {@code 200 OK}. For any other status, or
 * when an exception is thrown, the text accumulated so far (possibly an empty string) is
 * returned; failures are logged and never propagated to the caller.
 *
 * @param url    URL to request
 * @param cookie cookie entries, joined with {@code ";"} into a single {@code Cookie} header;
 *               the header is not sent when {@code CollectionUtils.isEmpty(cookie)} is true
 * @return the response body with each line terminated by {@code "\n"}, or whatever was read
 *         before a failure
 */
public String get(String url, List<String> cookie) {
    log.info("start url:{}", url);
    StringBuilder builder = new StringBuilder();
    try {
        URL realUrl = new URL(url);
        // Open the connection to the URL.
        HttpURLConnection conn = (HttpURLConnection) realUrl.openConnection();
        // Common request headers.
        conn.setRequestProperty("accept", "*/*");
        conn.setRequestProperty("connection", "Keep-Alive");
        conn.setRequestProperty("user-agent", USER_AGENT);
        if (!CollectionUtils.isEmpty(cookie)) {
            conn.setRequestProperty("Cookie", Joiner.on(";").join(cookie));
        }
        conn.connect();
        int status = conn.getResponseCode();
        if (status == HttpURLConnection.HTTP_OK) {
            // NOTE(review): no charset is passed, so the body is decoded with the platform
            // default charset -- confirm this matches the pages being fetched.
            // try-with-resources closes the reader even when reading fails midway.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream()))) {
                String line;
                while ((line = in.readLine()) != null) {
                    builder.append(line).append("\n");
                }
            }
        } else {
            // Previously a non-200 answer produced an empty result with no trace at all.
            log.warn("HttpUtil.get unexpected status {} for url:{}", status, url);
        }
        // "{}" is a logger placeholder, not a String.format one: let the logger substitute it.
        log.info("end get result: {}", builder);
    } catch (Exception e) {
        // Best-effort contract: log with the stack trace and fall through to the return below.
        log.error("HttpUtil.get error.", e);
    }
    return builder.toString();
}
HTML元素解析
/** * 获取body元素 * @param htmlStr * @return */ public static Element bodyByHtmlStr(String htmlStr) { //解析字符串为Document对象 Document doc = Jsoup.parse(htmlStr); //获取body元素,获取class="fc"的table元素 return doc.body(); }
/**
 * Reads a local HTML file into a single string.
 *
 * <p>The file is decoded with the charset name {@code "unicode"}. Each line is terminated
 * with {@code "\n"} so that tokens sitting on adjacent lines of the file are not glued
 * together. I/O failures are reported on standard error and do not propagate; in that case
 * the text read so far (possibly an empty string) is returned.
 *
 * @param file the HTML file to read
 * @return the file content, or the part read before an I/O failure
 */
public static String toHtmlString(File file) {
    StringBuilder htmlSb = new StringBuilder();
    // try-with-resources closes the reader even when reading fails midway.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(
            new FileInputStream(file), "unicode"))) {
        String line;
        // readLine() returning null is the end-of-stream signal; ready() only reports
        // whether a read would block and must not be used to detect end of file.
        while ((line = br.readLine()) != null) {
            htmlSb.append(line).append("\n");
        }
    } catch (IOException e) {
        // Covers a missing file and an unsupported charset name as well (both are
        // IOException subclasses). Kept best-effort: report and return what was read.
        e.printStackTrace();
    }
    return htmlSb.toString();
}

/**
 * Demo entry point: prints the text of every {@code td} cell found in the rows of the first
 * element carrying the CSS class {@code "fc"}, one output line per {@code tr}.
 *
 * @param args optional; {@code args[0]} overrides the default HTML file path
 */
public static void main(String[] args) {
    // Default sample location; an optional first argument overrides it.
    String filePath = (args != null && args.length > 0) ? args[0] : "D:\\test\\test.html";
    // Read the .html file into a string and parse it into a Document.
    String htmlStr = toHtmlString(new File(filePath));
    Document doc = Jsoup.parse(htmlStr);
    // Elements under <body> that carry class="fc".
    Elements tables = doc.body().getElementsByClass("fc");
    if (tables.isEmpty()) {
        // first() would return null here and the next call would throw.
        System.err.println("No element with class \"fc\" found in " + filePath);
        return;
    }
    // Direct children of the first match (presumably the tbody -- depends on the markup).
    Elements children = tables.first().children();
    if (children.isEmpty()) {
        System.err.println("The first \"fc\" element has no child elements in " + filePath);
        return;
    }
    // All tr elements below the first child.
    Elements rows = children.get(0).getElementsByTag("tr");
    // Print the text of each td, space-prefixed, one line per row.
    for (Element row : rows) {
        for (Element cell : row.getElementsByTag("td")) {
            System.out.print(" " + cell.text());
        }
        System.out.println();
    }
}

1693

被折叠的 条评论
为什么被折叠?



