这几天在学习用Java解析XML,突然想到DOM能不能解析HTML,结果试了半天行不通。于是查了一些资料,发现很多人都在用Jsoup解析HTML文件,便研究了一下,写了一个简单的实例。感觉还有很多地方需要润色,在这里分享出来,欢迎交流指教!后续想通过Java把数据导入到Excel,或者生成一个报表。
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches job listings from the Zhaopin (智联招聘) search-result pages and prints them to
 * standard output, one listing per line.
 *
 * <p>A search is defined by a city and a keyword, both supplied to the constructor. Each
 * printed line holds five columns separated by {@code *****}; the columns are read from the
 * elements carrying the CSS classes {@code zwmc}, {@code gsmc}, {@code zwyx}, {@code gzdd}
 * and {@code gxsj} inside the element with id {@code newlist_list_content_table}.
 */
public class JsoupHtml {
    /** Search endpoint; the (URL-encoded) city is appended directly after {@code jl=}. */
    private static final String SEARCH_URL = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=";
    /** Number of result pages requested per run. */
    private static final int PAGE_COUNT = 10;
    /** Separator printed between the columns of one listing. */
    private static final String SEPARATOR = "*****";

    /** City to search in, e.g. "西安". */
    private final String city;
    /** Search keyword, e.g. "java". */
    private final String keywords;

    /**
     * Creates a scraper for the given search.
     *
     * @param city     city to search in
     * @param keywords keyword to search for
     */
    public JsoupHtml(String city, String keywords) {
        this.city = city;
        this.keywords = keywords;
    }

    /**
     * Downloads result pages 1 to {@value #PAGE_COUNT} and prints every listing found.
     *
     * <p>A page that does not contain the expected result container is reported on standard
     * error and skipped. An I/O failure aborts the remaining pages and is reported on standard
     * error; it is not propagated to the caller.
     */
    public void getZhiLianWork() {
        try {
            // Encode the user-supplied values so that characters such as '&', '#', '+' or
            // spaces cannot corrupt the query string.
            String encodedCity = java.net.URLEncoder.encode(city, "UTF-8");
            String encodedKeywords = java.net.URLEncoder.encode(keywords, "UTF-8");

            for (int page = 1; page <= PAGE_COUNT; page++) {
                System.out.println("*********开始遍历第" + page + "页的求职信息*********");
                String pageUrl = SEARCH_URL + encodedCity
                        + "&kw=" + encodedKeywords
                        + "&p=" + page
                        + "&isadv=0";
                Document doc = Jsoup.connect(pageUrl).get();
                Element content = doc.getElementById("newlist_list_content_table");
                if (content == null) {
                    // getElementById returns null when the id is absent from the page.
                    System.err.println("第" + page + "页未找到职位列表(newlist_list_content_table),已跳过");
                } else {
                    printListings(content);
                }
                System.out.println("*********结束遍历第" + page + "页的求职信息*********");
            }
        } catch (IOException e) {
            System.err.println("获取智联招聘数据失败: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /** Prints one line per listing found inside {@code content}, followed by a blank line. */
    private static void printListings(Element content) {
        Elements zwmcEls = content.getElementsByClass("zwmc");
        Elements gsmcEls = content.getElementsByClass("gsmc");
        Elements zwyxEls = content.getElementsByClass("zwyx");
        Elements gzddEls = content.getElementsByClass("gzdd");
        Elements gxsjEls = content.getElementsByClass("gxsj");

        // The five lists are indexed in parallel; stop at the shortest one so that a row with
        // a missing cell cannot cause an IndexOutOfBoundsException.
        int rows = Math.min(zwmcEls.size(),
                Math.min(Math.min(gsmcEls.size(), zwyxEls.size()),
                        Math.min(gzddEls.size(), gxsjEls.size())));

        for (int row = 0; row < rows; row++) {
            // text() is called directly: Element.tagName(String) is a setter that would
            // rename the element, and it does not affect the text that is returned.
            System.out.println(
                    zwmcEls.get(row).text() + SEPARATOR
                            + gsmcEls.get(row).text() + SEPARATOR
                            + zwyxEls.get(row).text() + SEPARATOR
                            + gzddEls.get(row).text() + SEPARATOR
                            + gxsjEls.get(row).text());
            System.out.println();
        }
    }

    /** Runs a sample search for "java" jobs in 上海. */
    public static void main(String[] args) {
        JsoupHtml jHtml = new JsoupHtml("上海", "java");
        jHtml.getZhiLianWork();
    }
}
更新源代码,支持生成html表格:
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches job listings from the Zhaopin (智联招聘) search-result pages and renders them as an
 * HTML table.
 *
 * <p>The template {@code input.html} is read from the working directory, the element with id
 * {@code workinfo} is filled with one header row plus one row per listing, and the resulting
 * document is printed to standard output and written to {@code output.html} as UTF-8.
 */
public class JsoupHtml {
    /**
     * Builds the report for a fixed search (city "西安", keyword "java", result pages 1-10).
     *
     * <p>I/O failures are reported on standard error and are not propagated. The output file
     * is only opened once the whole document has been built, so a failed run does not
     * truncate an existing {@code output.html}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            // "jl" is the city parameter of the search endpoint; without the parameter name
            // the city value would be sent as a bare, meaningless query token.
            String url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=";
            // Encode the values so they are safe to embed in the query string.
            String city = java.net.URLEncoder.encode("西安", "UTF-8");
            String keywords = java.net.URLEncoder.encode("java", "UTF-8");

            File input = new File("input.html");
            Document doc2 = Jsoup.parse(input, "UTF-8", "");
            Element table = doc2.getElementById("workinfo");
            if (table == null) {
                // getElementById returns null when the template lacks the expected element.
                System.err.println("input.html 中未找到 id 为 workinfo 的元素");
                return;
            }
            // Drop whatever the template holds so the table is rebuilt from scratch.
            table.empty();

            Element theader = table.appendElement("tr");
            theader.appendElement("th").text("序号");
            theader.appendElement("th").text("职位名称");
            theader.appendElement("th").text("公司名称");
            theader.appendElement("th").text("职位月薪");
            theader.appendElement("th").text("工作地点");
            theader.appendElement("th").text("发布日期");

            // Result pages are requested as p=1..10, matching the page number used in the
            // row labels below.
            for (int page = 1; page <= 10; page++) {
                Document doc = Jsoup.connect(url + city + "&kw=" + keywords + "&p=" + page).get();
                Element content = doc.getElementById("newlist_list_content_table");
                if (content == null) {
                    System.err.println("第" + page + "页未找到职位列表(newlist_list_content_table),已跳过");
                    continue;
                }
                Elements zwmcEls = content.getElementsByClass("zwmc");
                Elements gsmcEls = content.getElementsByClass("gsmc");
                Elements zwyxEls = content.getElementsByClass("zwyx");
                Elements gzddEls = content.getElementsByClass("gzdd");
                Elements gxsjEls = content.getElementsByClass("gxsj");

                // The five lists are indexed in parallel; stop at the shortest one so that a
                // row with a missing cell cannot cause an IndexOutOfBoundsException.
                int rows = Math.min(zwmcEls.size(),
                        Math.min(Math.min(gsmcEls.size(), zwyxEls.size()),
                                Math.min(gzddEls.size(), gxsjEls.size())));

                // Index 0 is skipped, as in the original loop.
                // NOTE(review): presumably the first match is the result list's own header
                // row - confirm against the live page markup.
                for (int i = 1; i < rows; i++) {
                    Element tr = table.appendElement("tr");
                    tr.appendElement("td").text(page + "-" + i);
                    // text() is called directly: Element.tagName(String) is a setter that
                    // would rename the element, and it does not affect the returned text.
                    tr.appendElement("td").text(zwmcEls.get(i).text());
                    tr.appendElement("td").text(gsmcEls.get(i).text());
                    tr.appendElement("td").text(zwyxEls.get(i).text());
                    tr.appendElement("td").text(gzddEls.get(i).text());
                    tr.appendElement("td").text(gxsjEls.get(i).text());
                }
            }

            String html = doc2.html();
            System.out.println(html);
            // try-with-resources closes (and flushes) the writer even if write() fails.
            try (BufferedWriter bWriter = new BufferedWriter(
                    new OutputStreamWriter(
                            new FileOutputStream("output.html"), "utf-8"))) {
                bWriter.write(html);
            }
        } catch (IOException e) {
            System.err.println("生成 output.html 失败: " + e.getMessage());
            e.printStackTrace();
        }
    }
}
output.html模板(程序从input.html读取该模板,填充表格后写入output.html):
<!doctype html>
<!--
  Report template. The Java program looks up the element with id "workinfo" and fills it
  with table rows, so that id must be kept.
-->
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="Generator" content="EditPlus®">
<meta name="Author" content="">
<meta name="Keywords" content="">
<meta name="Description" content="">
<title>智联工作信息</title>
<style>
body{margin:0;padding:0;}
.header{height:100px;width:100%;background:#39c;color:#fff;text-align:center;line-height:100px;font-size:40px;
font-family:"微软雅黑";}
.body{width:100%;background:#fff;}
.body table{width:90%;margin:0 auto;color:#2e2e2e;border:1px solid #cad9ea; border-collapse: collapse; }
/* Both selectors are scoped to the report table; a bare "td" would match every cell on the page. */
.body table th,.body table td{min-width:50px;max-width:300px;}
.feeter{height:30px;width:100%;background:#39c;color:#fff;text-align:center;line-height:30px;font-size:14px;
font-family:"微软雅黑";}
</style>
</head>
<body>
<div class="header">智联工作信息</div>
<div class="body">
<table class="work" border="1">
<tbody id="workinfo">
</tbody>
</table>
</div>
<div class="feeter">版权所有 翻版必究@2018 Joker</div>
</body>
</html>