Java爬虫篇
最近做了个项目,其中涉及到爬虫的功能,需要将省厅和市级网站的数据爬取下来展示在系统项目中,主要用到的技术就是Jsoup和json的解析,下面主要讲解爬虫思路以及对分页数据的处理(递归)。
1.爬虫网站
http://zwfw-new.hunan.gov.cn/onething/service/index.jsp
需要将这里的数据给爬下来。
2.根据分类查询去爬取数据,这里我根据行政许可类型去获取数据。
3.通过F12分析数据来源
图上列表的数据通过分析是通过js动态拼接请求的,而子项是通过url直接请求获取的。
http://zwfw-new.hunan.gov.cn/onething/service/hnservicesdglinfo.jsp?type=xndtbm&typeid=62bab4bfbbc142a5bb4e2f61c68accb7&word=&areacode=439900000000&itemType=02&orgId=62bab4bfbbc142a5bb4e2f61c68accb7&charName=&minSeq=&typeCode=02&main=&dghy=&pager.offset=1&pager.desc=true
这是拼接后得到的url。
该url通过Jsoup发起请求来获取数据,代码如下:
/**
 * Crawls one listing page of service items for category {@code s} from
 * zwfw-new.hunan.gov.cn, turns each table row into a
 * {@code GovBusinessProcessEntity} (plus one entity per sub-item, fetched with a
 * second request for rows that have sub-items) and stores them via {@code saveBatch}.
 *
 * NOTE: this listing does not contain the end of the method; the closing brace and
 * the success return are outside this excerpt.
 *
 * @param page page number to crawl; page 1 produces pager.offset=0, page 2 produces 10, etc.
 * @param s    category code; placed in the request URL and stored as each entity's type
 * @return {@code DataResult.fail(...)} when either HTTP request throws {@code IOException}
 */
@Override
public DataResult syncBslc(int page, String s) {
// Debug output of the requested category code.
System.out.println(s);
// Build the listing URL for the requested category and page.
StringBuffer sb = new StringBuffer();
// Value sent as pager.offset: (page - 1) * 10.
int num = page * 10 - 10;
String ss = null;
// The two outer branches build the same URL when PATTERN matches; they differ only
// in the non-matching case, where page 1 hard-codes itemType/typeCode to "power"
// while later pages use the value of s.
if (page != 1) {
// Decide which URL flavour to use based on the category code.
// PATTERN is declared outside this excerpt — presumably it matches numeric category codes; confirm.
if (PATTERN.matcher(s).matches()) {
ss = "sdgl";
sb.append("http://zwfw-new.hunan.gov.cn/onething/service/hnservice" + ss + "info.jsp?type=xndtbm&typeid=62bab4bfbbc142a5bb4e2f61c68accb7&word=&areacode=439900000000&itemType=" + s + "&orgId=62bab4bfbbc142a5bb4e2f61c68accb7&charName=&minSeq=&typeCode=" + s + "&main=&dghy=&pager.offset=" + num + "&pager.desc=true");
} else {
ss = s;
sb.append("http://zwfw-new.hunan.gov.cn/onething/service/hnservice" + ss + "info.jsp?type=xndtbm&typeid=62bab4bfbbc142a5bb4e2f61c68accb7&word=&areacode=439900000000&itemType=" + ss + "&orgId=62bab4bfbbc142a5bb4e2f61c68accb7&charName=&minSeq=&typeCode=" + ss + "&main=&dghy=&pager.offset=" + num + "&pager.desc=true");
}
} else {
// Same category check as above, for the first page.
if (PATTERN.matcher(s).matches()) {
ss = "sdgl";
sb.append("http://zwfw-new.hunan.gov.cn/onething/service/hnservice" + ss + "info.jsp?type=xndtbm&typeid=62bab4bfbbc142a5bb4e2f61c68accb7&word=&areacode=439900000000&itemType=" + s + "&orgId=62bab4bfbbc142a5bb4e2f61c68accb7&charName=&minSeq=&typeCode=" + s + "&main=&dghy=&pager.offset=" + num + "&pager.desc=true");
} else {
ss = s;
sb.append("http://zwfw-new.hunan.gov.cn/onething/service/hnservice" + ss + "info.jsp?type=xndtbm&typeid=62bab4bfbbc142a5bb4e2f61c68accb7&word=&areacode=439900000000&itemType=power&orgId=62bab4bfbbc142a5bb4e2f61c68accb7&charName=&minSeq=&typeCode=power&main=&dghy=&pager.offset=" + num + "&pager.desc=true");
}
}
// Fetch the listing page.
Connection.Response response = null;
try {
response = Jsoup.connect(sb.toString()).execute();
} catch (IOException e) {
// NOTE(review): the IOException is discarded; nothing about the failure is logged.
return DataResult.fail("爬取出错,请稍后再试");
}
// Parse the response body and collect every <tr> element.
List<GovBusinessProcessEntity> govBusinessProcessEntities = new ArrayList<>();
GovBusinessProcessEntity govBusinessProcessEntity = null;
String html = response.body();
Document document = Jsoup.parse(html);
Elements elements = document.getElementsByTag("tr");
// Drop rows that are not processed: the first <tr> is removed in every case
// (presumably the table header — confirm); on the first page of the "power"
// category the last <tr> is removed as well.
// NOTE(review): remove(0) runs before the emptiness check below.
if (num > 0) {
elements.remove(0);
// elements.remove(elements.size()-1);
} else if ("power".equals(s)) {
elements.remove(0);
elements.remove(elements.size() - 1);
} else {
elements.remove(0);
}
// elements.remove(elements.size() - 1);
// NOTE(review): the same emptiness check is repeated twice in a row.
if (!CollectionUtils.isEmpty(elements)) {
if (!CollectionUtils.isEmpty(elements)) {
String content = null;
String doUnit = null;
String level = null;
String url = null;
String approveId = null;
// Counts the rows that have sub-items; sent as the "num" query parameter of the sub-item request.
int count = 0;
for (Element cur : elements) {
// Re-parse the row's inner HTML as a standalone document.
Document curDocument = Jsoup.parse(cur.html());
// Crawl the service item: collect the anchors found in the row.
Elements td = curDocument.getElementsByTag("a");
// Both branches of the loop body end with break, so only the first anchor of a row is processed.
for (Element t : td) {
govBusinessProcessEntity = new GovBusinessProcessEntity();
// Check whether the service item has sub-items.
// NOTE(review): the class name passed here contains a space — confirm it matches as intended.
if (cur.getElementsByClass("waiLi childUl").size() == 0) {
// NOTE(review): this id is generated but never passed to setId in this branch.
String id = UUID.randomUUID().toString();
content = t.text();
doUnit = cur.getElementsByTag("td").get(2).text();
level = cur.getElementsByTag("td").get(3).text();
// approve id = text between the first single quote of the onclick attribute and the
// next single quote found at or after index 17.
approveId = t.attr("onclick").substring(t.attr("onclick").indexOf("'") + 1, t.attr("onclick").indexOf("'", 17));
url = "http://zwfw-new.hunan.gov.cn/onething/service/serviceguideck.jsp?approve_id=" + approveId + "&type=xndtbm&dghy=&cscjwt=&ygzw=&areacode=439900000000";
govBusinessProcessEntity.setContent(content);
govBusinessProcessEntity.setDoUnit(doUnit);
govBusinessProcessEntity.setLevel(level);
govBusinessProcessEntity.setUrl(url);
govBusinessProcessEntity.setParentId("1");
// Store the category type.
govBusinessProcessEntity.setType(s);
govBusinessProcessEntities.add(govBusinessProcessEntity);
break;
} else {
count++;
// This id is set on the parent entity and used as parentId of its sub-items.
String id = UUID.randomUUID().toString();
content = t.text();
// Fall back to fixed default values when the row has no "sp3" text.
// NOTE(review): doUnit and level are both read from the first "sp3" element, whereas
// the sub-item loop below reads index 0 and index 1 — confirm this is intended.
if (StringUtils.isBlank(cur.getElementsByClass("sp3").text())) {
doUnit = "湖南省应急管理厅";
} else {
doUnit = cur.getElementsByClass("sp3").get(0).text();
}
if (StringUtils.isBlank(cur.getElementsByClass("sp3").text())) {
level = "省级/直属";
} else {
level = cur.getElementsByClass("sp3").get(0).text();
}
approveId = t.attr("onclick").substring(t.attr("onclick").indexOf("'") + 1, t.attr("onclick").indexOf("'", 17));
url = "http://zwfw-new.hunan.gov.cn/onething/service/serviceguideck.jsp?approve_id=" + approveId + "&type=xndtbm&dghy=&cscjwt=&ygzw=&areacode=439900000000";
govBusinessProcessEntity.setContent(content);
govBusinessProcessEntity.setDoUnit(doUnit);
govBusinessProcessEntity.setLevel(level);
govBusinessProcessEntity.setUrl(url);
govBusinessProcessEntity.setParentId("1");
govBusinessProcessEntity.setId(id);
govBusinessProcessEntity.setType(s);
govBusinessProcessEntities.add(govBusinessProcessEntity);
// Crawl the sub-items with a second request. For "power" the URL uses type_code=01
// and num=count+1; for every other category it uses type_code=s and num=count.
StringBuffer sx = new StringBuffer();
if ("power".equals(s)) {
url = "http://zwfw-new.hunan.gov.cn/onething/service/serviceguidecklistdo.jsp?approve_id=" + approveId + "&type=xndtbm&typeid=62bab4bfbbc142a5bb4e2f61c68accb7&word=&type_code=01&transact_level=&make_transaction=&num=" + (count + 1);
} else {
url = "http://zwfw-new.hunan.gov.cn/onething/service/serviceguidecklistdo.jsp?approve_id=" + approveId + "&type=xndtbm&typeid=62bab4bfbbc142a5bb4e2f61c68accb7&word=&type_code=" + s + "&transact_level=&make_transaction=&num=" + count;
}
sx.append(url);
try {
response = Jsoup.connect(sx.toString()).execute();
} catch (IOException e) {
// NOTE(review): returning here skips saveBatch below, so entities collected so far
// on this page are not saved; the exception itself is discarded.
return DataResult.fail("爬取出错,请稍后再试");
}
String ht = response.body();
Document doc = Jsoup.parse(ht);
Elements elementsByClass = doc.getElementsByClass("smallLi clearfix li-hover");
// Elements els = curDocument.getElementsByTag("ul");
// One entity per sub-item element, linked to the parent row through parentId.
for (Element e : elementsByClass) {
content = e.getElementsByClass("sp1").get(0).text() + e.getElementsByClass("sp2").get(0).text();
doUnit = e.getElementsByClass("sp3").get(0).text();
level = e.getElementsByClass("sp3").get(1).text();
// Same quote-delimited extraction as above, here searching for the closing quote from index 14.
approveId = e.getElementsByClass("sp2").attr("onclick").substring(e.getElementsByClass("sp2").attr("onclick").indexOf("'") + 1, e.getElementsByClass("sp2").attr("onclick").indexOf("'", 14));
url = "http://zwfw-new.hunan.gov.cn/onething/service/serviceguideck.jsp?approve_id=" + approveId + "&type=xndtbm&dghy=&cscjwt=&ygzw=&areacode=439900000000";
GovBusinessProcessEntity entity = new GovBusinessProcessEntity();
entity.setContent(content);
entity.setDoUnit(doUnit);
entity.setLevel(level);
entity.setType(s);
entity.setUrl(url);
entity.setParentId(id);
entity.setId(UUID.randomUUID().toString());
govBusinessProcessEntities.add(entity);
}
break;
}
}
}
// Batch insert of everything collected from this page.
this.saveBatch(govBusinessProcessEntities);
}
}
// The method body is not closed in this listing; it continues outside this excerpt.
常用的Jsoup的api介绍:
- 请求执行
Jsoup.connect(sb.toString()).execute();
- 获取字符的html内容
String html = response.body();
- 将字符串转化为Document对象,后面通过ID、类、标签等去获取节点
Document document = Jsoup.parse(html);
常用获取节点方式
//通过Id去获取Element对象
Element element = document.getElementById("tr");
//通过Tag标签去获取Element对象集合
Elements elements = document.getElementsByTag("tr");
//通过Class去获取Element对象集合
Elements elements = document.getElementsByClass("tr");
分页数据的处理
通过上述方法可以爬取当前页的数据。对于分页的数据,我们采取递归的方式处理:先解析出总页数,如果当前页码小于总页数,就休眠一段时间后递归调用自身去爬取下一页,直到最后一页为止。
// Parse the pager: the total page count is only read when an "el-pager" element exists.
Elements pageTotal = document.getElementsByClass("el-pager");
int maxPage = 0;
if (!pageTotal.isEmpty()) {
    // Read the "total" text once; the page count is the slice between "共" and "页",
    // skipping one extra character on each side (the +2 / -1 offsets).
    String totalText = document.getElementsByClass("total").text();
    maxPage = Integer.parseInt(totalText.substring(totalText.indexOf("共") + 2, totalText.indexOf("页") - 1));
}
log.info("当前第" + page + "页;共" + (maxPage) + "页");
// If more pages remain, pause before recursing into the next page.
if (page < maxPage) {
    try {
        Thread.sleep(3000);
    } catch (InterruptedException e) {
        // An interrupt is a request to stop: restore the flag for callers and end the
        // crawl here instead of printing a stack trace and continuing to recurse.
        Thread.currentThread().interrupt();
        return DataResult.fail("爬取出错,请稍后再试");
    }
    return syncBslc(page + 1, s);
}